From 1329c0a75e6a7defc5c380eaf80d8e0f66d7da78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20L=C3=B3pez?= Date: Thu, 31 Oct 2024 10:09:52 +0100 Subject: [PATCH 001/279] kompute: add mul_mat_q4_k shader (#10097) This is a more or less direct translation from the Metal implementation to GLSL. Signed-off-by: Sergio Lopez --- ggml/src/CMakeLists.txt | 2 + ggml/src/ggml-kompute.cpp | 42 ++++++ ggml/src/kompute-shaders/common.comp | 9 ++ ggml/src/kompute-shaders/op_mul_mat_q4_k.comp | 133 ++++++++++++++++++ 4 files changed, 186 insertions(+) create mode 100644 ggml/src/kompute-shaders/op_mul_mat_q4_k.comp diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index aa405e4d0..915568798 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -800,6 +800,7 @@ if (GGML_KOMPUTE) kompute-shaders/op_mul_mat_q8_0.comp kompute-shaders/op_mul_mat_q4_0.comp kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q4_k.comp kompute-shaders/op_mul_mat_q6_k.comp kompute-shaders/op_getrows_f32.comp kompute-shaders/op_getrows_f16.comp @@ -833,6 +834,7 @@ if (GGML_KOMPUTE) shaderop_mul_mat_q8_0.h shaderop_mul_mat_q4_0.h shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q4_k.h shaderop_mul_mat_q6_k.h shaderop_getrows_f32.h shaderop_getrows_f16.h diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index fea69fb04..2fea9e4cc 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -20,6 +20,7 @@ #include "shaderop_mul_mat_q8_0.h" #include "shaderop_mul_mat_q4_0.h" #include "shaderop_mul_mat_q4_1.h" +#include "shaderop_mul_mat_q4_k.h" #include "shaderop_mul_mat_q6_k.h" #include "shaderop_mul_mat_mat_f32.h" #include "shaderop_getrows_f32.h" @@ -1067,6 +1068,40 @@ static void ggml_vk_mul_mat_q8_0(Args&&... args) { ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward(args)...); } +static void ggml_vk_mul_mat_q4_k( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10, + int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, + int32_t ne1, int32_t r2, int32_t r3 +) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, + kp::shader_data::op_mul_mat_q4_k_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3; + } pushConsts { + 0, 0, 0, + ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + static void ggml_vk_mul_mat_q6_k( kp::Sequence& seq, const std::shared_ptr& inA, @@ -1384,6 +1419,7 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: return true; default: ; @@ -1635,6 +1671,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3 ); break; + case GGML_TYPE_Q4_K: + ggml_vk_mul_mat_q4_k( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03 + ); + break; case GGML_TYPE_Q6_K: ggml_vk_mul_mat_q6_k( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, diff --git a/ggml/src/kompute-shaders/common.comp b/ggml/src/kompute-shaders/common.comp index 62d62b025..2aaddf704 100644 --- a/ggml/src/kompute-shaders/common.comp +++ b/ggml/src/kompute-shaders/common.comp @@ -15,6 +15,7 @@ #define TWOPI_F 6.283185307179586f #define QK_K 256 +#define K_SCALE_SIZE 12 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) @@ -64,6 +65,14 @@ mat4 dequantize_q4_1(const block_q4_1 xb, uint il) { return reg; } +#define sizeof_block_q4_k 144 +struct block_q4_k { + float16_t d; + float16_t dmin; + uint8_t scales[K_SCALE_SIZE]; + uint8_t qs[QK_K/2]; +}; + #define sizeof_block_q6_k 210 struct block_q6_k { uint8_t ql[QK_K/2]; // quants, lower 4 bits diff --git a/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp new file mode 100644 index 000000000..fc8e45aa9 --- /dev/null +++ b/ggml/src/kompute-shaders/op_mul_mat_q4_k.comp @@ -0,0 +1,133 @@ +#version 450 + +#include "common.comp" + +#define N_DST 4 +#define SIZE_OF_BLOCK sizeof_block_q4_k + +layout(local_size_x = 4) in; +layout(local_size_y = 8) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int ne02; + int ne12; + int r2; + int r3; +} pcs; + +void main() { + const uint16_t kmask1 = uint16_t(0x3f3f); + const uint16_t kmask2 = uint16_t(0x0f0f); + const uint16_t kmask3 = uint16_t(0xc0c0); + + const uint ix = gl_SubgroupInvocationID/8; // 0...3 + const uint it = gl_SubgroupInvocationID%8; // 0...7 + const uint iq = it/4; // 0 or 1 + const uint ir = it%4; // 0...3 + + const uint nb = pcs.ne00/QK_K; + + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + const uint first_row = r0 * N_DST; + const uint ib_row = first_row * nb; + + const uint i12 = im%pcs.ne12; + const uint i13 = im/pcs.ne12; + + const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); + + const uint xblk = ib_row + offset0 + pcs.inAOff; + const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; + + float yl[16]; + float yh[16]; + float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f}; + float all_sum = 0.f; + + uint y4 = y + ix * QK_K + 64 * iq + 8 * ir; + + for (uint ib = ix; ib < nb; ib += 4) { + const uint blk_idx = ib + xblk; + + float sumy[4] = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0]; + yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8]; + } + + for (int row = 0; row < N_DST; row++) { + uint row_idx = row * nb; + + uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); + uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); + uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4); + uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6); + uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8); + + uint16_t sc16[4]; + sc16[0] = sc_0 & kmask1; + sc16[1] = sc_2 & kmask1; + sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2); + sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2); + + float acc1[4] = {0.f, 0.f, 0.f, 0.f}; + float acc2[4] = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i); + uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i); + acc1[0] += yl[i+0] * (q1 & 0x000F); + acc1[1] += yl[i+1] * (q1 & 0x0F00); + acc1[2] += yl[i+8] * (q1 & 0x00F0); + acc1[3] += yl[i+9] * (q1 & 0xF000); + acc2[0] += yh[i+0] * (q2 & 0x000F); + acc2[1] += yh[i+1] * (q2 & 0x0F00); + acc2[2] += yh[i+8] * (q2 & 0x00F0); + acc2[3] += yh[i+9] * (q2 & 0xF000); + } + + uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF); + uint8_t sc8_1 = uint8_t(sc16[0] >> 8 ); + uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF); + uint8_t sc8_3 = uint8_t(sc16[1] >> 8 ); + uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF); + uint8_t sc8_5 = uint8_t(sc16[2] >> 8 ); + uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF); + uint8_t sc8_7 = uint8_t(sc16[3] >> 8 ); + + float dall = float(inA[blk_idx + row_idx].d); + float dmin = float(inA[blk_idx + row_idx].dmin); + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) - + dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7); + } + + y4 += 4 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = subgroupAdd(sumf[row]); + if (subgroupElect()) { + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum; + } + } +} From dea5e86051aadcdf42f7db7a8855a78d8f5ff3d6 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 31 Oct 2024 11:40:59 +0100 Subject: [PATCH 002/279] ggml : check tensor name lengths in gguf files (#10100) --- ggml/src/ggml.c | 45 ++++++++++++++++++++++++++++++++++++--------- src/llama.cpp | 7 +++++-- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0d99b0791..149d8f970 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -22102,18 +22102,46 @@ static size_t gguf_type_size(enum gguf_type type) { return GGUF_TYPE_SIZE[type]; } -static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { - GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS); - GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT); +static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { + if (info->n_dims > GGML_MAX_DIMS) { + fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims); + return false; + } + + if (info->type < 0 || info->type >= GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type); + return false; + } + + if (strlen(info->name.data) >= GGML_MAX_NAME) { + fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data); + return false; + } for (uint32_t i = 0; i < info->n_dims; ++i) { - GGML_ASSERT(info->ne[i] > 0); + if (info->ne[i] <= 0) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]); + return false; + } } // prevent overflow for total number of elements - GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]); - GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]); - GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]); + if (INT64_MAX/info->ne[1] <= info->ne[0]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]); + return false; + } + + if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]); + return false; + } + + if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) { + fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]); + return false; + } + + return true; } static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { @@ -22414,8 +22442,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); - // TODO: return an error instead of crashing with GGML_ASSERT - gguf_tensor_info_sanitize(info); + ok = ok && gguf_tensor_info_sanitize(info); // make sure there is no duplicated tensor names for (uint64_t j = 0; j < i && ok; ++j) { diff --git a/src/llama.cpp b/src/llama.cpp index ef1b8ee59..60a0db29c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4273,8 +4273,11 @@ struct llama_model_loader { llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { const int tensor_idx = gguf_find_tensor(gguf_ctx, name); - offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + if (tensor_idx < 0) { + throw std::runtime_error(format("tensor '%s' not found in the model", name)); + } + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) { throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name)); } @@ -7426,7 +7429,7 @@ static bool llm_load_tensors( if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) { return nullptr; } - throw std::runtime_error(format("missing tensor %s", tn.str().c_str())); + throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); } // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops From 0a683e8088d849626e7471f9e2ed381f7dbdf2e9 Mon Sep 17 00:00:00 2001 From: Kevin Gibbons Date: Thu, 31 Oct 2024 06:02:35 -0700 Subject: [PATCH 003/279] server : include scheme when printing URL (#10106) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7953b5065..f914ff88c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3259,7 +3259,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.terminate(); }; - LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); ctx_server.queue_tasks.start_loop(); From ab3d71f97f5b2915a229099777af00d3eada1d24 Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Fri, 1 Nov 2024 02:50:39 +0800 Subject: [PATCH 004/279] loader: refactor tensor weights storage (#9935) * loader: refactor tensor weights storage * use sorted map, sort weights by layer --------- Co-authored-by: slaren --- src/llama.cpp | 123 ++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 58 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 60a0db29c..bc94d7ff0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4271,20 +4271,34 @@ struct llama_model_loader { ggml_tensor * tensor; - llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { - const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); if (tensor_idx < 0) { - throw std::runtime_error(format("tensor '%s' not found in the model", name)); + throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); } offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) { - throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name)); + throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); } } }; - std::vector weights; + // custom comparator to sort weights more nicely by layer + struct weight_name_comparer { + bool operator()(const std::string & a, const std::string & b) const { + int a_layer = -1; + int b_layer = -1; + sscanf(a.c_str(), "blk.%d.", &a_layer); + sscanf(b.c_str(), "blk.%d.", &b_layer); + if (a_layer != b_layer) { + return a_layer < b_layer; + } + return a < b; + } + }; + + std::map weights_map; std::unordered_map kv_overrides; struct gguf_context * meta = NULL; @@ -4326,7 +4340,14 @@ struct llama_model_loader { // For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - weights.emplace_back(files.back().get(), 0, cur->name, meta, cur); + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur)); } uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); @@ -4366,7 +4387,14 @@ struct llama_model_loader { // Save tensors data offset info of the shard. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur); + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur)); } gguf_free(ctx_gguf); @@ -4376,7 +4404,7 @@ struct llama_model_loader { // sanity check { - const int n_tensors_loaded = (int) weights.size(); + const int n_tensors_loaded = (int) weights_map.size(); if (n_tensors != n_tensors_loaded) { throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); } @@ -4386,23 +4414,10 @@ struct llama_model_loader { } n_kv = gguf_get_n_kv(meta); - n_tensors = weights.size(); + n_tensors = weights_map.size(); fver = (enum llama_fver) gguf_get_version(meta); - std::set tensor_names; - for (auto & w : weights) { - n_elements += ggml_nelements(w.tensor); - n_bytes += ggml_nbytes(w.tensor); - // make sure there is no duplicated tensor names - const std::string name(w.tensor->name); - auto found = tensor_names.find(name); - if (found != tensor_names.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name)); - } - tensor_names.insert(name); - } - LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); @@ -4414,8 +4429,10 @@ struct llama_model_loader { uint32_t n_type_max = 0; enum ggml_type type_max = GGML_TYPE_F32; - for (int i = 0; i < n_tensors; i++) { - const ggml_tensor * tensor = weights.at(i).tensor; + for (const auto & it : weights_map) { + const llama_tensor_weight & w = it.second; + const ggml_tensor * tensor = w.tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -4426,8 +4443,8 @@ struct llama_model_loader { } if (trace > 0) { - const uint16_t sid = weights.at(i).idx; - LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + const uint16_t sid = w.idx; + LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } @@ -4691,21 +4708,13 @@ struct llama_model_loader { return llm_kv.arch; } - const char * get_tensor_name(int i) const { - return weights.at(i).tensor->name; - } - const llama_tensor_weight * get_weight(const char * name) const { - for (const auto & weight : weights) { - if (strcmp(name, weight.tensor->name) == 0) { - return &weight; - } + auto pos = weights_map.find(name); + if (pos != weights_map.end()) { + return &pos->second; } - return nullptr; - } - const llama_tensor_weight * get_weight(int i) const { - return get_weight(get_tensor_name(i)); + return nullptr; } const llama_tensor_weight & require_weight(const char * name) const { @@ -4732,10 +4741,6 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * get_tensor_meta(int i) const { - return get_tensor_meta(get_tensor_name(i)); - } - const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); @@ -4842,8 +4847,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (auto & w : weights) { - size_data += ggml_nbytes(w.tensor); + for (const auto & it : weights_map) { + size_data += ggml_nbytes(it.second.tensor); } } @@ -18598,10 +18603,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - for (int i = 0; i < ml.n_tensors; ++i) { - const struct ggml_tensor * meta = ml.get_tensor_meta(i); + for (const auto & it : ml.weights_map) { + const struct ggml_tensor * tensor = it.second.tensor; - const std::string name = ggml_get_name(meta); + const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || @@ -18639,20 +18644,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> f32_conv_buf; uint16_t n_split = 1; + const auto & weights_map = ml.weights_map; + // Assume split index is continuous if (params->keep_split) { - for (int i = 0; i < ml.n_tensors; ++i) { - n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split); + for (const auto & it : weights_map) { + n_split = std::max(uint16_t(it.second.idx + 1), n_split); } + } std::vector ctx_outs(n_split, NULL); ctx_outs[0] = ctx_out; // populate the original tensors so we get an initial meta data - for (int i = 0; i < ml.n_tensors; ++i) { - auto weight = ml.get_weight(i); - uint16_t i_split = params->keep_split ? weight->idx : 0; - struct ggml_tensor * tensor = weight->tensor; + for (const auto & it : weights_map) { + uint16_t i_split = params->keep_split ? it.second.idx : 0; + struct ggml_tensor * tensor = it.second.tensor; if (ctx_outs[i_split] == NULL) { ctx_outs[i_split] = gguf_init_empty(); } @@ -18699,12 +18706,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (int i = 0; i < ml.n_tensors; ++i) { - auto weight = ml.get_weight(i); - struct ggml_tensor * tensor = weight->tensor; - if (weight->idx != cur_split && params->keep_split) { + for (const auto & it : weights_map) { + const auto & weight = it.second; + struct ggml_tensor * tensor = weight.tensor; + if (weight.idx != cur_split && params->keep_split) { close_ofstream(); - new_ofstream(weight->idx); + new_ofstream(weight.idx); } const std::string name = ggml_get_name(tensor); From c02e5ab2a675c8bc1abc8b1e4cb6a93b26bdcce7 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 31 Oct 2024 22:54:23 +0100 Subject: [PATCH 005/279] llama : fix buffer checks for mamba and rwk (#10111) * llama : fix buffer checks for mamba and rwk * llama : fix missing worst case flag during reserve * cuda : fix supports_op for norm * disable sched SET_CAUSE --- ggml/src/ggml-backend.cpp | 2 +- ggml/src/ggml-cuda.cu | 6 ++++-- ggml/src/ggml.c | 1 + src/llama.cpp | 38 +++++++++++++++++++++++++++++--------- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index f397f6252..c2afdf391 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1508,7 +1508,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } -#if 1 +#if 0 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 087091516..b57f1b3b7 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3107,18 +3107,20 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } return false; } break; + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0; + break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: - case GGML_OP_RMS_NORM: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 149d8f970..6a7154920 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7272,6 +7272,7 @@ struct ggml_tensor * ggml_ssm_conv( const int64_t n_s = sx->ne[2]; // TODO: maybe support other strides than 1? + // FIXME: this is always true? GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t); GGML_ASSERT(sx->ne[1] == d_inner); GGML_ASSERT(n_t >= 0); diff --git a/src/llama.cpp b/src/llama.cpp index bc94d7ff0..e697c310c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_MUL_MAT: { - ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); } break; case GGML_OP_MUL_MAT_ID: @@ -7167,18 +7167,38 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_SSM_CONV: { - // TODO: ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - op_tensor = ggml_ssm_conv(ctx, nullptr, w); + // FIXME + ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789); + op_tensor = ggml_ssm_conv(ctx, conv_x, w); } break; case GGML_OP_SSM_SCAN: { - // TODO: ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - op_tensor = ggml_ssm_scan(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr); + // FIXME + const int64_t d_state = w->ne[0]; + const int64_t d_inner = w->ne[1]; + const int64_t n_seq_tokens = 512; + const int64_t n_seqs = 1; + ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs); + ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs); + ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); + ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); + op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C); } break; case GGML_OP_RWKV_WKV: { - // TODO: ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - op_tensor = ggml_rwkv_wkv(ctx, nullptr, nullptr, nullptr, w, nullptr, nullptr); + // FIXME + const int64_t S = 123; + const int64_t H = 123; + const int64_t n_tokens = 123; + const int64_t n_seqs = 123; + ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens); + ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * tf = w; + ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); + ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); + op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state); } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); @@ -7453,7 +7473,7 @@ static bool llm_load_tensors( // tensors with "bias" suffix are always used with GGML_OP_ADD ggml_op op; - bool bias = strcmp(tn.suffix, "bias") == 0; + bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0; if (bias) { op = GGML_OP_ADD; } else { @@ -19681,7 +19701,7 @@ struct llama_context * llama_new_context_with_model( int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = llama_build_graph(*ctx, ubatch_pp, false); + gf_pp = llama_build_graph(*ctx, ubatch_pp, true); if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); llama_free(ctx); From 1e9f94994ef908d964cf81069f03d9d3668beb7d Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 1 Nov 2024 00:45:34 +0100 Subject: [PATCH 006/279] quantize : fix --keep-split (#10114) --- src/llama.cpp | 53 +++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e697c310c..ed3998a1f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4860,19 +4860,12 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - try { - const auto * weight = get_weight(ggml_get_name(tensor)); - if (!weight) { - continue; - } - if (weight->idx != idx) { - continue; - } - *first = std::min(*first, weight->offs); - *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); - } catch(...) { - // the tensor is not in the model + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); } } @@ -5049,7 +5042,6 @@ struct llama_model_loader { ggml_backend_tensor_set(cur, data, 0, n_size); } } else { - GGML_ASSERT(weight->idx < files.size()); const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); @@ -18623,8 +18615,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + // make a list of weights + std::vector tensors; + tensors.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { - const struct ggml_tensor * tensor = it.second.tensor; + tensors.push_back(&it.second); + } + + // keep_split requires that the weights are sorted by split index + if (params->keep_split) { + std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + if (a->idx == b->idx) { + return a->offs < b->offs; + } + return a->idx < b->idx; + }); + } + + for (const auto * it : tensors) { + const struct ggml_tensor * tensor = it->tensor; const std::string name = ggml_get_name(tensor); @@ -18664,22 +18673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> f32_conv_buf; uint16_t n_split = 1; - const auto & weights_map = ml.weights_map; // Assume split index is continuous if (params->keep_split) { - for (const auto & it : weights_map) { - n_split = std::max(uint16_t(it.second.idx + 1), n_split); + for (const auto * it : tensors) { + n_split = std::max(uint16_t(it->idx + 1), n_split); } - } std::vector ctx_outs(n_split, NULL); ctx_outs[0] = ctx_out; // populate the original tensors so we get an initial meta data - for (const auto & it : weights_map) { - uint16_t i_split = params->keep_split ? it.second.idx : 0; - struct ggml_tensor * tensor = it.second.tensor; + for (const auto * it : tensors) { + uint16_t i_split = params->keep_split ? it->idx : 0; + struct ggml_tensor * tensor = it->tensor; if (ctx_outs[i_split] == NULL) { ctx_outs[i_split] = gguf_init_empty(); } @@ -18726,8 +18733,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (const auto & it : weights_map) { - const auto & weight = it.second; + for (const auto * it : tensors) { + const auto & weight = *it; struct ggml_tensor * tensor = weight.tensor; if (weight.idx != cur_split && params->keep_split) { close_ofstream(); From 85679d37f34f66783cc04664a06c405b28e8e035 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 1 Nov 2024 00:49:53 +0100 Subject: [PATCH 007/279] llama : improve output buffer type selection (#10098) --- src/llama.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ed3998a1f..ca0d259b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17162,18 +17162,10 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { auto * buft = ggml_backend_cpu_buffer_type(); // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - ggml_tensor * output_tensor = lctx.model.output; - if (!output_tensor) { - // bert models don't have an output tensor, use the last layer - output_tensor = lctx.model.layers.back().layer_out_norm; - } - if (output_tensor) { - auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer); - auto * output_dev = ggml_backend_buft_get_device(output_buft); - auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev); - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } + auto * output_dev = lctx.model.dev_output.dev; + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; } lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size); if (lctx.buf_output == nullptr) { From e597e50794f07ec8dc24b9efb18f94ec6386fda0 Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Fri, 1 Nov 2024 11:09:59 +0800 Subject: [PATCH 008/279] build: fix build error in Windows env with OneAPI setup (#10107) --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 915568798..7365ac91b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1402,7 +1402,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) - if (NOT WIN32 OR NOT GGML_SYCL) + if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT}) list(APPEND GGML_EXTRA_LIBS_PRIVATE m) endif() endif() From f221d56220899f38f0126e683b2432bc79d1e3f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 10:23:05 +0200 Subject: [PATCH 009/279] ggml : alloc ggml_contexts on the heap (whisper/2525) --- ggml/include/ggml.h | 7 +++-- ggml/src/ggml.c | 67 +++++++++++++-------------------------------- 2 files changed, 23 insertions(+), 51 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index de3c706fc..e5862246c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -217,7 +217,6 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 2048 -#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 #define GGML_MAX_N_THREADS 512 #define GGML_MAX_OP_PARAMS 64 @@ -657,6 +656,7 @@ extern "C" { }; // scratch buffer + // TODO: deprecate and remove struct ggml_scratch { size_t offs; size_t size; @@ -760,8 +760,9 @@ extern "C" { // main - GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); - GGML_API void ggml_free(struct ggml_context * ctx); + GGML_API struct ggml_context * ggml_init (struct ggml_init_params params); + GGML_API void ggml_reset(struct ggml_context * ctx); + GGML_API void ggml_free (struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6a7154920..59f2ed043 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { } #define GGML_DEBUG 0 + #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -2014,7 +2015,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); struct ggml_context { size_t mem_size; - void* mem_buffer; + void * mem_buffer; bool mem_buffer_owned; bool no_alloc; bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers @@ -3263,7 +3264,6 @@ struct ggml_numa_nodes { // struct ggml_state { - struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; struct ggml_numa_nodes numa; }; @@ -3845,7 +3845,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); g_state = (struct ggml_state) { - /*.contexts =*/ { { 0 } }, /*.numa =*/ { .n_nodes = 0, .total_cpus = 0, @@ -3864,26 +3863,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { is_first_call = false; } - // find non-used context in g_state - struct ggml_context * ctx = NULL; + ggml_critical_section_end(); - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (!g_state.contexts[i].used) { - g_state.contexts[i].used = true; - ctx = &g_state.contexts[i].context; - - GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); - break; - } - } - - if (ctx == NULL) { - GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); - - ggml_critical_section_end(); - - return NULL; - } + struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context)); // allow to call ggml_init with 0 size if (params.mem_size == 0) { @@ -3911,42 +3893,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: context initialized\n", __func__); - ggml_critical_section_end(); - return ctx; } +void ggml_reset(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + + ctx->n_objects = 0; + ctx->objects_begin = NULL; + ctx->objects_end = NULL; + ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, }; + ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, }; +} + void ggml_free(struct ggml_context * ctx) { if (ctx == NULL) { return; } - // make this function thread safe - ggml_critical_section_start(); - - bool found = false; - - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (&g_state.contexts[i].context == ctx) { - g_state.contexts[i].used = false; - - GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", - __func__, i, ggml_used_mem(ctx)); - - if (ctx->mem_buffer_owned) { - ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); - } - - found = true; - break; - } + if (ctx->mem_buffer_owned) { + ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); } - if (!found) { - GGML_PRINT_DEBUG("%s: context not found\n", __func__); - } - - ggml_critical_section_end(); + GGML_FREE(ctx); } size_t ggml_used_mem(const struct ggml_context * ctx) { From 815fe72adcea5ec79d358db6a4c479191f396b3c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 10:28:24 +0200 Subject: [PATCH 010/279] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index da40927e1..48863847c 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -162e232411ee98ceb0cccfa84886118d917d2123 +bb78a40dc60e04c626bac2b65840b509988e990d From 1804adb0cfee4811eaf633741503d683a46e4c77 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 12:58:45 +0200 Subject: [PATCH 011/279] ggml : remove ggml_scratch (#10121) ggml-ci --- ggml/include/ggml.h | 9 ------ ggml/src/ggml.c | 67 ++------------------------------------------- 2 files changed, 2 insertions(+), 74 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e5862246c..41df85557 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -655,14 +655,6 @@ extern "C" { void * abort_callback_data; }; - // scratch buffer - // TODO: deprecate and remove - struct ggml_scratch { - size_t offs; - size_t size; - void * data; - }; - struct ggml_init_params { // memory pool size_t mem_size; // bytes @@ -766,7 +758,6 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 59f2ed043..84f2c766b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2018,15 +2018,11 @@ struct ggml_context { void * mem_buffer; bool mem_buffer_owned; bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers int n_objects; struct ggml_object * objects_begin; struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; }; struct ggml_context_container { @@ -3879,12 +3875,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, - /*.no_alloc_save =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -3904,8 +3897,6 @@ void ggml_reset(struct ggml_context * ctx) { ctx->n_objects = 0; ctx->objects_begin = NULL; ctx->objects_end = NULL; - ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, }; - ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, }; } void ggml_free(struct ggml_context * ctx) { @@ -3924,14 +3915,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx) { return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { - const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; - - ctx->scratch = scratch; - - return result; -} - bool ggml_get_no_alloc(struct ggml_context * ctx) { return ctx->no_alloc; } @@ -3959,27 +3942,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { return max_size; } -// IMPORTANT: -// when creating "opt" tensors, always save and load the scratch buffer -// this is an error prone process, but it is necessary to support inplace -// operators when using scratch buffers -// TODO: implement a better way -static void ggml_scratch_save(struct ggml_context * ctx) { - // this is needed to allow opt tensors to store their data - // TODO: again, need to find a better way - ctx->no_alloc_save = ctx->no_alloc; - ctx->no_alloc = false; - - ctx->scratch_save = ctx->scratch; - ctx->scratch.data = NULL; -} - -static void ggml_scratch_load(struct ggml_context * ctx) { - ctx->no_alloc = ctx->no_alloc_save; - - ctx->scratch = ctx->scratch_save; -} - //////////////////////////////////////////////////////////////////////////////// static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { @@ -4060,29 +4022,13 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; if (view_src == NULL && !ctx->no_alloc) { - if (ctx->scratch.data != NULL) { - // allocate tensor data in the scratch buffer - if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + data_size, ctx->scratch.size); - assert(false); - return NULL; - } - - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - ctx->scratch.offs += data_size; - } else { - // allocate tensor data in the context's memory pool - obj_alloc_size = data_size; - } + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; } struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); GGML_ASSERT(obj_new); - // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here - struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); #ifdef __clang__ @@ -4178,24 +4124,16 @@ struct ggml_tensor * ggml_new_tensor_4d( } struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - ggml_scratch_load(ctx); - ggml_set_i32(result, value); return result; } struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_scratch_load(ctx); - ggml_set_f32(result, value); return result; @@ -20263,7 +20201,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { uint64_t size_eval = 0; // compute size of intermediate results - // TODO: does not take into account scratch buffers !!!! for (int i = 0; i < cgraph->n_nodes; ++i) { size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } From d865d1478cd4e403f82d793c2afcd0f943412f05 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Fri, 1 Nov 2024 13:33:14 +0000 Subject: [PATCH 012/279] server : fix smart selection of available slot (#10120) * Fix smart selection of available slot * minor fix * replace vectors of tokens with shorthands --- examples/server/server.cpp | 35 +++++++++---------------- examples/server/utils.hpp | 52 ++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f914ff88c..54cdb4b72 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -725,12 +725,12 @@ struct server_context { return nullptr; } - server_slot * get_available_slot(const std::string & prompt) { + server_slot * get_available_slot(const server_task & task) { server_slot * ret = nullptr; // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { - int max_lcp_len = 0; + if (ret == nullptr && slot_prompt_similarity != 0.0f) { + int max_lcs_len = 0; float similarity = 0; for (server_slot & slot : slots) { @@ -740,25 +740,25 @@ struct server_context { } // skip the slot if it does not contains cached tokens - if (slot.prompt_tokens.empty()) { + if (slot.cache_tokens.empty()) { continue; } - // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens); + // length of the Longest Common Subsequence between the current slot's prompt and the input prompt + int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); - // fraction of the common substring length compared to the current slot's prompt length - similarity = static_cast(lcp_len) / static_cast(slot.prompt_tokens.size()); + // fraction of the common subsequence length compared to the current slot's prompt length + similarity = static_cast(lcs_len) / static_cast(slot.cache_tokens.size()); // select the current slot if the criteria match - if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { - max_lcp_len = lcp_len; + if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) { + max_lcs_len = lcs_len; ret = &slot; } } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); + SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity); } } @@ -1514,18 +1514,7 @@ struct server_context { { const int id_slot = json_value(task.data, "id_slot", -1); - server_slot * slot; - - if (id_slot != -1) { - slot = get_slot_by_id(id_slot); - } else { - std::string prompt; - if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { - prompt = json_value(task.data, "prompt", std::string()); - } - - slot = get_available_slot(prompt); - } + server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 58f5a5684..871a17a4f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -439,18 +439,60 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t longest_common_prefix(const std::string & a, const std::string & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} +static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } - return i; + // get the lengths of the input sequences + int a_len = a.size(); + int b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + int max_length = 0; + + // use two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (int i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (int j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; } static bool ends_with(const std::string & str, const std::string & suffix) { From ba6f62eb793d6617892d252f5c04d7685d908a38 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 17:31:51 +0200 Subject: [PATCH 013/279] readme : update hot topics --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8fe1f4b4b..0378a674e 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** +- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123 +- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- From 418f5eef262cea07c2af4f45ee6a88d882221fcb Mon Sep 17 00:00:00 2001 From: Shupei Fan Date: Sat, 2 Nov 2024 02:33:14 +0800 Subject: [PATCH 014/279] vulkan : improve ggml_vk_create_buffer error handling (#9898) --- ggml/src/ggml-vulkan.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83c37ea9c..a8e78c4db 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor return buf; } - buf->size = size; vk::BufferCreateInfo buffer_create_info{ vk::BufferCreateFlags(), size, @@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor if (memory_type_index == UINT32_MAX) { device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } @@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor } catch (const vk::SystemError& e) { device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw e; } } else { // Out of Host/Device memory, clean up buffer device->device.destroyBuffer(buf->buffer); - buf->size = 0; throw e; } } @@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); buf->device = device; + buf->size = size; #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger->log_allocation(buf, size); From e991e3127ff71a29e61fe1de5dd1cbd2e1df1858 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 1 Nov 2024 23:48:26 +0100 Subject: [PATCH 015/279] llama : use smart pointers for ggml resources (#10117) --- ggml/include/ggml-cpp.h | 38 ++++ ggml/src/CMakeLists.txt | 1 + spm-headers/ggml-cpp.h | 1 + src/llama.cpp | 424 +++++++++++++++++----------------------- 4 files changed, 219 insertions(+), 245 deletions(-) create mode 100644 ggml/include/ggml-cpp.h create mode 120000 spm-headers/ggml-cpp.h diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h new file mode 100644 index 000000000..219361af4 --- /dev/null +++ b/ggml/include/ggml-cpp.h @@ -0,0 +1,38 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include + +// Smart pointers for ggml types + +// ggml + +struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } }; +struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } }; + +typedef std::unique_ptr ggml_context_ptr; +typedef std::unique_ptr gguf_context_ptr; + +// ggml-alloc + +struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; + +typedef std::unique_ptr ggml_gallocr_ptr; + +// ggml-backend + +struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } }; +struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } }; +struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } }; +struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } }; + +typedef std::unique_ptr ggml_backend_ptr; +typedef std::unique_ptr ggml_backend_buffer_ptr; +typedef std::unique_ptr ggml_backend_event_ptr; +typedef std::unique_ptr ggml_backend_sched_ptr; diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 7365ac91b..0764a8d90 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1368,6 +1368,7 @@ add_library(ggml ../include/ggml.h ../include/ggml-alloc.h ../include/ggml-backend.h + ../include/ggml-cpp.h ggml.c ggml-alloc.c ggml-backend.cpp diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h new file mode 120000 index 000000000..8a8604cc2 --- /dev/null +++ b/spm-headers/ggml-cpp.h @@ -0,0 +1 @@ +../ggml/include/ggml-cpp.h \ No newline at end of file diff --git a/src/llama.cpp b/src/llama.cpp index ca0d259b2..0991c4089 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7,6 +7,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "ggml-cpp.h" // TODO: replace with ggml API call #define QK_K 256 @@ -2797,31 +2798,22 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; - size_t total_size() const { + size_t total_size() { size_t size = 0; - for (ggml_backend_buffer_t buf : bufs) { - size += ggml_backend_buffer_get_size(buf); + for (auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); } return size; } - - ~llama_kv_cache() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } }; struct llama_control_vector { std::vector tensors; // per layer - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; int32_t layer_start = -1; int32_t layer_end = -1; @@ -2840,15 +2832,6 @@ struct llama_control_vector { } return cur; } - - ~llama_control_vector() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } }; struct llama_model { @@ -2908,10 +2891,10 @@ struct llama_model { std::vector dev_layer; // contexts where the model tensors metadata is stored - std::vector ctxs; + std::vector ctxs; // the model memory buffers for the tensor data - std::vector bufs; + std::vector bufs; // model memory mapped files llama_mmaps mappings; @@ -2930,13 +2913,7 @@ struct llama_model { std::set lora_adapters; ~llama_model() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - while (!lora_adapters.empty()) { + while (!lora_adapters.empty()) { llama_lora_adapter_free(*lora_adapters.begin()); } } @@ -3253,16 +3230,6 @@ struct llama_context { , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} - ~llama_context() { - ggml_backend_sched_free(sched); - - for (ggml_backend_t backend : backends) { - ggml_backend_free(backend); - } - - ggml_backend_buffer_free(buf_output); - } - const struct llama_model & model; struct llama_cparams cparams; @@ -3272,7 +3239,7 @@ struct llama_context { std::unordered_map lora_adapters; - std::vector backends; + std::vector backends; std::vector> set_n_threads_fns; ggml_backend_t backend_cpu = nullptr; @@ -3294,7 +3261,7 @@ struct llama_context { mutable int32_t n_eval = 0; // number of eval calls // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_t buf_output = nullptr; + ggml_backend_buffer_ptr buf_output; // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits @@ -3324,7 +3291,7 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_sched_t sched = nullptr; + ggml_backend_sched_ptr sched; ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; @@ -3358,8 +3325,8 @@ struct llama_lora_adapter { struct llama_model * base_model; // map tensor name to lora_a_b std::unordered_map ab_map; - std::vector ctxs; - std::vector bufs; + std::vector ctxs; + std::vector bufs; float alpha; @@ -3377,12 +3344,6 @@ struct llama_lora_adapter { } ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } auto pos = base_model->lora_adapters.find(this); if (pos != base_model->lora_adapters.end()) { base_model->lora_adapters.erase(pos); @@ -3401,24 +3362,21 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); + ggml_context_ptr ctx { ggml_init(params) }; if (!ctx) { throw std::runtime_error(format("failed to create ggml context")); } - ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0); - ggml_tensor * op_tensor = fn(ctx); + ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) }; + ggml_tensor * op_tensor = fn(ctx.get()); for (int i = 0; i < GGML_MAX_SRC; i++) { if (op_tensor->src[i] != nullptr) { assert(op_tensor->src[i]->buffer == nullptr); - op_tensor->src[i]->buffer = buf; + op_tensor->src[i]->buffer = buf.get(); } } bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); - ggml_free(ctx); - ggml_backend_buffer_free(buf); - return op_supported; } @@ -3470,7 +3428,8 @@ static bool llama_kv_cache_init( // create a context for each buffer type std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - if (ctx_map.count(buft) == 0) { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { struct ggml_init_params params = { /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, @@ -3481,9 +3440,10 @@ static bool llama_kv_cache_init( return nullptr; } ctx_map[buft] = ctx; - cache.ctxs.push_back(ctx); + cache.ctxs.emplace_back(ctx); + return ctx; } - return ctx_map.at(buft); + return it->second; }; cache.k_l.reserve(n_layer); @@ -3535,7 +3495,7 @@ static bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.push_back(buf); + cache.bufs.emplace_back(buf); } return true; @@ -3788,7 +3748,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.used = 0; for (auto & buf : cache.bufs) { - ggml_backend_buffer_clear(buf, 0); + ggml_backend_buffer_clear(buf.get(), 0); } } @@ -4301,8 +4261,8 @@ struct llama_model_loader { std::map weights_map; std::unordered_map kv_overrides; - struct gguf_context * meta = NULL; - std::vector contexts; + gguf_context_ptr meta; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); @@ -4325,7 +4285,7 @@ struct llama_model_loader { /*.ctx = */ &ctx, }; - meta = gguf_init_from_file(fname.c_str(), params); + meta.reset(gguf_init_from_file(fname.c_str(), params)); if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } @@ -4347,7 +4307,7 @@ struct llama_model_loader { } n_elements += ggml_nelements(cur); n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur)); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); } uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); @@ -4377,7 +4337,7 @@ struct llama_model_loader { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path, split_params) }; if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); } @@ -4394,10 +4354,8 @@ struct llama_model_loader { } n_elements += ggml_nelements(cur); n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur)); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); } - - gguf_free(ctx_gguf); } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); @@ -4413,10 +4371,10 @@ struct llama_model_loader { LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } - n_kv = gguf_get_n_kv(meta); + n_kv = gguf_get_n_kv(meta.get()); n_tensors = weights_map.size(); - fver = (enum llama_fver) gguf_get_version(meta); + fver = (enum llama_fver) gguf_get_version(meta.get()); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); @@ -4487,23 +4445,23 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV + const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(meta, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(meta, i); - const enum gguf_type type = gguf_get_kv_type(meta, i); + const char * name = gguf_get_key(meta.get(), i); + const enum gguf_type type = gguf_get_kv_type(meta.get(), i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(meta, i); + std::string value = gguf_kv_to_str(meta.get(), i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -4532,19 +4490,10 @@ struct llama_model_loader { this->check_tensors = check_tensors; } - ~llama_model_loader() { - if (meta) { - gguf_free(meta); - } - for (auto * ctx : contexts) { - ggml_free(ctx); - } - } - template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); if (kid < 0) { if (required) { @@ -4554,7 +4503,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); result = arr_info.length; @@ -4569,9 +4518,9 @@ struct llama_model_loader { template bool get_arr(const std::string & key, std::vector & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -4579,7 +4528,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; @@ -4598,9 +4547,9 @@ struct llama_model_loader { template bool get_arr(const std::string & key, std::array & result, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -4608,7 +4557,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; @@ -4640,7 +4589,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(meta, key, result, override); + const bool found = GGUFMeta::GKV::set(meta.get(), key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -4657,7 +4606,7 @@ struct llama_model_loader { // get array of n <= N_MAX elements, or a single element repeated n times template bool get_key_or_arr(const std::string & key, std::array & result, uint32_t n, const bool required = true) { - const int kid = gguf_find_key(meta, key.c_str()); + const int kid = gguf_find_key(meta.get(), key.c_str()); if (kid < 0) { if (required) { @@ -4670,9 +4619,9 @@ struct llama_model_loader { throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); } - if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) { + if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta, kid); + GGUFMeta::GKV::get_kv(meta.get(), kid); if (n != arr_info.length) { throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); @@ -5342,7 +5291,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.meta; + const gguf_context * ctx = ml.meta.get(); // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -6109,7 +6058,7 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.meta; + struct gguf_context * ctx = ml.meta.get(); const auto kv = LLM_KV(model.arch); @@ -7104,10 +7053,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); - if (!ctx) { + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { throw std::runtime_error(format("failed to create ggml context")); } + ggml_context * ctx = ctx_ptr.get(); ggml_tensor * op_tensor = nullptr; @@ -7203,8 +7153,6 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_backend_buffer_free(w->buffer); w->buffer = nullptr; - ggml_free(ctx); - return op_supported; } @@ -7395,7 +7343,8 @@ static bool llm_load_tensors( std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - if (ctx_map.count(buft) == 0) { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -7406,9 +7355,10 @@ static bool llm_load_tensors( throw std::runtime_error(format("failed to create ggml context")); } ctx_map[buft] = ctx; - model.ctxs.push_back(ctx); + model.ctxs.emplace_back(ctx); + return ctx; } - return ctx_map.at(buft); + return it->second; }; // create tensors for the weights @@ -9134,7 +9084,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } - model.bufs.push_back(buf); + model.bufs.emplace_back(buf); bufs.emplace(idx, buf); } } @@ -9143,7 +9093,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } - model.bufs.push_back(buf); + model.bufs.emplace_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); @@ -9183,13 +9133,13 @@ static bool llm_load_tensors( } // print memory requirements per buffer type - for (ggml_backend_buffer_t buf : model.bufs) { - LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + for (auto & buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); } // populate tensors_by_name - for (ggml_context * ctx : model.ctxs) { - for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + for (auto & ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) { model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } } @@ -10294,10 +10244,8 @@ struct llm_build_context { } void free() { - if (ctx0) { - ggml_free(ctx0); - ctx0 = nullptr; - } + ggml_free(ctx0); + ctx0 = nullptr; } struct ggml_cgraph * build_k_shift() { @@ -10325,10 +10273,10 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); - for (auto * backend : lctx.backends) { + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend); + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); break; } } @@ -16443,7 +16391,7 @@ static struct ggml_cgraph * llama_build_graph( if (!lctx.cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu); + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); } } @@ -16453,10 +16401,10 @@ static struct ggml_cgraph * llama_build_graph( if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = lctx.model.dev_layer.at(il); - for (auto * backend : lctx.backends) { - if (ggml_backend_get_device(backend) == dev_layer.dev) { - if (ggml_backend_supports_op(backend, cur)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + for (auto & backend : lctx.backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer.dev) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get()); } } } @@ -17143,7 +17091,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { lctx.output_ids.resize(n_batch); } - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0; + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; const size_t new_size = (logits_size + embd_size) * sizeof(float); // alloc only when more than the current capacity is required @@ -17154,7 +17102,6 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif - ggml_backend_buffer_free(lctx.buf_output); lctx.buf_output = nullptr; lctx.logits = nullptr; lctx.embd = nullptr; @@ -17167,14 +17114,14 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { if (output_dev_host_buft) { buft = output_dev_host_buft; } - lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size); + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); if (lctx.buf_output == nullptr) { LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); return 0; } } - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output); + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); lctx.logits = has_logits ? output_base : nullptr; lctx.embd = has_embd ? output_base + logits_size : nullptr; @@ -17186,7 +17133,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // set all ids as invalid (negative) std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - ggml_backend_buffer_clear(lctx.buf_output, 0); + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); lctx.n_outputs = 0; @@ -17246,7 +17193,7 @@ static void llama_graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf); + auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); if (err != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err); } @@ -17404,8 +17351,8 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_reset(lctx.sched.get()); + ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); @@ -17433,7 +17380,7 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_inputs(lctx, ubatch); @@ -17456,7 +17403,7 @@ static int llama_decode_internal( // extract logits if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(lctx.logits != nullptr); @@ -17472,7 +17419,7 @@ static int llama_decode_internal( // extract embeddings if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { @@ -17567,7 +17514,7 @@ static int llama_decode_internal( // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); return 0; } @@ -17645,8 +17592,8 @@ static int llama_encode_internal( GGML_ASSERT(n_threads > 0); - ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_reset(lctx.sched.get()); + ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); @@ -17670,7 +17617,7 @@ static int llama_encode_internal( } } - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_inputs(lctx, ubatch); @@ -17678,7 +17625,7 @@ static int llama_encode_internal( // extract embeddings if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); GGML_ASSERT(backend_embd != nullptr); if (llama_model_has_decoder(&lctx.model)) { @@ -17745,7 +17692,7 @@ static int llama_encode_internal( // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); return 0; } @@ -17959,7 +17906,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { #else // ggml_graph defrag - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); @@ -17981,11 +17928,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } { - ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - ggml_backend_sched_alloc_graph(lctx.sched, gf); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); llama_set_k_shift(lctx); @@ -18025,8 +17972,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched); - if (!ggml_backend_sched_reserve(lctx.sched, gf)) { + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } } @@ -18577,30 +18524,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } const size_t align = GGUF_DEFAULT_ALIGNMENT; - struct gguf_context * ctx_out = gguf_init_empty(); + gguf_context_ptr ctx_out { gguf_init_empty() }; // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.meta); - gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV - gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV + gguf_set_kv (ctx_out.get(), ml.meta.get()); + gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV + gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV // Remove split metadata - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); - gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); + gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); + gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); + gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); if (params->kv_overrides) { const std::vector & overrides = *(const std::vector *)params->kv_overrides; - for (auto & o : overrides) { + for (const auto & o : overrides) { if (o.key[0] == 0) break; if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { - gguf_set_val_f32(ctx_out, o.key, o.val_f64); + gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { - gguf_set_val_i32(ctx_out, o.key, o.val_i64); + gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { - gguf_set_val_bool(ctx_out, o.key, o.val_bool); + gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { - gguf_set_val_str(ctx_out, o.key, o.val_str); + gguf_set_val_str(ctx_out.get(), o.key, o.val_str); } else { LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); } @@ -18672,25 +18619,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s n_split = std::max(uint16_t(it->idx + 1), n_split); } } - std::vector ctx_outs(n_split, NULL); - ctx_outs[0] = ctx_out; + std::vector ctx_outs(n_split); + ctx_outs[0] = std::move(ctx_out); // populate the original tensors so we get an initial meta data for (const auto * it : tensors) { uint16_t i_split = params->keep_split ? it->idx : 0; struct ggml_tensor * tensor = it->tensor; - if (ctx_outs[i_split] == NULL) { - ctx_outs[i_split] = gguf_init_empty(); + if (!ctx_outs[i_split]) { + ctx_outs[i_split].reset(gguf_init_empty()); } - gguf_add_tensor(ctx_outs[i_split], tensor); + gguf_add_tensor(ctx_outs[i_split].get(), tensor); } // Set split info if needed if (n_split > 1) { for (size_t i = 0; i < ctx_outs.size(); ++i) { - gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); - gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); - gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); + gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); + gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); + gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); } } @@ -18700,8 +18647,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // Write metadata and close file handler if (fout.is_open()) { fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_outs[cur_split])); - gguf_get_meta_data(ctx_outs[cur_split], data.data()); + std::vector data(gguf_get_meta_size(ctx_outs[cur_split].get())); + gguf_get_meta_data(ctx_outs[cur_split].get(), data.data()); fout.write((const char *) data.data(), data.size()); fout.close(); } @@ -18718,7 +18665,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s fout = std::ofstream(fname, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors - const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]); + const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get()); // placeholder for the meta data ::zeros(fout, meta_size); }; @@ -18903,17 +18850,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s total_size_new += new_size; // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); - gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); + gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size); // write tensor data + padding fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } close_ofstream(); - for (auto & c:ctx_outs) { - gguf_free(c); - } LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); @@ -18927,51 +18871,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); - ggml_context * ctx = nullptr; + ggml_context * ctx_init; struct gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, - /* .ctx = */ &ctx, + /* .ctx = */ &ctx_init, }; - struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + + gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; if (!ctx_gguf) { throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); } + ggml_context_ptr ctx { ctx_init }; + // check metadata { auto get_kv_str = [&](const std::string & key) -> std::string { - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); }; auto get_kv_f32 = [&](const std::string & key) -> float { - int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); + int id = gguf_find_key(ctx_gguf.get(), key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); if (general_type != "adapter") { - gguf_free(ctx_gguf); throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); } auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); auto general_arch = llm_arch_from_string(general_arch_str); if (general_arch != model->arch) { - gguf_free(ctx_gguf); throw std::runtime_error("model arch and LoRA arch mismatch"); } auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); if (adapter_type != "lora") { - gguf_free(ctx_gguf); throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); } - int n_tensors = gguf_get_n_tensors(ctx_gguf); + int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); // contexts for each buffer type std::map ctx_map; @@ -18985,7 +18929,11 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c /*.no_alloc =*/ true, }; ggml_context * buft_ctx = ggml_init(params); + if (!buft_ctx) { + return nullptr; + } ctx_map[buft] = buft_ctx; + adapter.ctxs.emplace_back(buft_ctx); return buft_ctx; }; return it->second; @@ -18996,7 +18944,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c auto str_endswith = [](const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; }; - for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { std::string name(cur->name); if (str_endswith(name, ".lora_a")) { replace_all(name, ".lora_a", ""); @@ -19013,8 +18961,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ab_map[name].b = cur; } } else { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } @@ -19025,28 +18971,20 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c llama_lora_weight & w = it.second; if (!w.a || !w.b) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); } // device buft and device ctx auto * model_tensor = llama_get_model_tensor(model, name.c_str()); if (!model_tensor) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); } struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("tensor '" + name + "' has incorrect shape"); } if (w.a->ne[1] != w.b->ne[0]) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); } // save tensor to adapter @@ -19061,18 +18999,15 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c { adapter.ctxs.reserve(ctx_map.size()); adapter.bufs.reserve(ctx_map.size()); - for (auto it : ctx_map) { + for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx_dev = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); + ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) }; if (!buf) { - gguf_free(ctx_gguf); - ggml_free(ctx); throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } - LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - adapter.ctxs.push_back(ctx_dev); - adapter.bufs.push_back(buf); + LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0); + adapter.bufs.emplace_back(std::move(buf)); } } @@ -19081,7 +19016,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c llama_file gguf_file(path_lora, "rb"); std::vector read_buf; auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { - size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); + size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); size_t size = ggml_nbytes(orig); read_buf.resize(size); gguf_file.seek(offs, SEEK_SET); @@ -19097,10 +19032,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); - - // free ctx for reading gguf - gguf_free(ctx_gguf); - ggml_free(ctx); } int32_t llama_lora_adapter_set( @@ -19549,7 +19480,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return nullptr; } - ctx->backends.push_back(backend); + ctx->backends.emplace_back(backend); } // add ACCEL backends (such as BLAS) @@ -19562,7 +19493,7 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return nullptr; } - ctx->backends.push_back(backend); + ctx->backends.emplace_back(backend); } } @@ -19573,16 +19504,16 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return nullptr; } - ctx->backends.push_back(ctx->backend_cpu); + ctx->backends.emplace_back(ctx->backend_cpu); // create a list of the set_n_threads functions in the backends - for (auto * backend : ctx->backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend); + for (auto & backend : ctx->backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; if (reg) { auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); if (ggml_backend_set_n_threads_fn) { - ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn); + ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); } } } @@ -19621,17 +19552,18 @@ struct llama_context * llama_new_context_with_model( } LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output), - ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); + ggml_backend_buffer_name(ctx->buf_output.get()), + ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); } // scheduler and compute buffers { // buffer types used for the compute buffer of each backend std::vector backend_buft; - for (auto * backend : ctx->backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend); - if (ggml_backend_is_cpu(backend) && !model->devices.empty()) { + std::vector backend_ptrs; + for (auto & backend : ctx->backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model->devices[0]; auto * host_buft = ggml_backend_dev_host_buffer_type(dev); @@ -19640,6 +19572,7 @@ struct llama_context * llama_new_context_with_model( } } backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); } const size_t max_nodes = llama_model_max_nodes(*model); @@ -19657,12 +19590,12 @@ struct llama_context * llama_new_context_with_model( // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { - for (auto * backend : ctx->backends) { - if (ggml_backend_is_cpu(backend)) { + for (auto & backend : ctx->backends) { + if (ggml_backend_is_cpu(backend.get())) { // ignore CPU backend continue; } - auto * dev = ggml_backend_get_device(backend); + auto * dev = ggml_backend_get_device(backend.get()); ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); if (!props.caps.async || !props.caps.events) { @@ -19673,10 +19606,10 @@ struct llama_context * llama_new_context_with_model( } } - ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); + ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); } // initialize scheduler with the worst-case graph @@ -19688,29 +19621,29 @@ struct llama_context * llama_new_context_with_model( ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(ctx->sched, gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched); + ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); int n_nodes_pp = ggml_graph_n_nodes(gf_pp); // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); - ggml_backend_sched_reserve(ctx->sched, gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched); + ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - if (!ggml_backend_sched_reserve(ctx->sched, gf_pp)) { + if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); llama_free(ctx); return nullptr; } - for (size_t i = 0; i < ctx->backends.size(); i++) { - ggml_backend_t backend = ctx->backends[i]; + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); + size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); if (size > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), @@ -19990,7 +19923,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const // create a context for each buffer type std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - if (ctx_map.count(buft) == 0) { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { struct ggml_init_params params = { /*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, @@ -20001,12 +19935,12 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const return nullptr; } ctx_map[buft] = ctx; - cvec.ctxs.push_back(ctx); + cvec.ctxs.emplace_back(ctx); + return ctx; } - return ctx_map.at(buft); + return it->second; }; - // make tensors cvec.tensors.reserve(model.hparams.n_layer); cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 @@ -20037,7 +19971,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const return false; } ggml_backend_buffer_clear(buf, 0); - cvec.bufs.push_back(buf); + cvec.bufs.emplace_back(buf); } return true; @@ -21305,7 +21239,7 @@ int32_t llama_decode( } void llama_synchronize(struct llama_context * ctx) { - ggml_backend_sched_synchronize(ctx->sched); + ggml_backend_sched_synchronize(ctx->sched.get()); // FIXME: if multiple single tokens are evaluated without a synchronization, // the stats will be added to the prompt evaluation stats From a6744e43e80f4be6398fc7733a01642c846dce1d Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 1 Nov 2024 23:50:59 +0100 Subject: [PATCH 016/279] llama : add simple-chat example (#10124) * llama : add simple-chat example --------- Co-authored-by: Xuan Son Nguyen --- Makefile | 6 + examples/CMakeLists.txt | 1 + examples/simple-chat/CMakeLists.txt | 5 + examples/simple-chat/README.md | 7 + examples/simple-chat/simple-chat.cpp | 197 +++++++++++++++++++++++++++ ggml/include/ggml.h | 8 +- 6 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 examples/simple-chat/CMakeLists.txt create mode 100644 examples/simple-chat/README.md create mode 100644 examples/simple-chat/simple-chat.cpp diff --git a/Makefile b/Makefile index 719f45d16..051436344 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ BUILD_TARGETS = \ llama-save-load-state \ llama-server \ llama-simple \ + llama-simple-chat \ llama-speculative \ llama-tokenize \ llama-vdot \ @@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-simple-chat: examples/simple-chat/simple-chat.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-tokenize: examples/tokenize/tokenize.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ead630661..6df318c19 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,6 +49,7 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) + add_subdirectory(simple-chat) add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt new file mode 100644 index 000000000..87723533b --- /dev/null +++ b/examples/simple-chat/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-simple-chat) +add_executable(${TARGET} simple-chat.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/simple-chat/README.md b/examples/simple-chat/README.md new file mode 100644 index 000000000..f0099ce3d --- /dev/null +++ b/examples/simple-chat/README.md @@ -0,0 +1,7 @@ +# llama.cpp/example/simple-chat + +The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file. + +```bash +./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048 +... diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp new file mode 100644 index 000000000..14264cfcb --- /dev/null +++ b/examples/simple-chat/simple-chat.cpp @@ -0,0 +1,197 @@ +#include "llama.h" +#include +#include +#include +#include +#include + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]); + printf("\n"); +} + +int main(int argc, char ** argv) { + std::string model_path; + int ngl = 99; + int n_ctx = 2048; + + // parse command line arguments + for (int i = 1; i < argc; i++) { + try { + if (strcmp(argv[i], "-m") == 0) { + if (i + 1 < argc) { + model_path = argv[++i]; + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-c") == 0) { + if (i + 1 < argc) { + n_ctx = std::stoi(argv[++i]); + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-ngl") == 0) { + if (i + 1 < argc) { + ngl = std::stoi(argv[++i]); + } else { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } catch (std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + print_usage(argc, argv); + return 1; + } + } + if (model_path.empty()) { + print_usage(argc, argv); + return 1; + } + + // only print errors + llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) { + if (level >= GGML_LOG_LEVEL_ERROR) { + fprintf(stderr, "%s", text); + } + }, nullptr); + + // initialize the model + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; + + llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); + if (!model) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + // initialize the context + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; + + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (!ctx) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + // initialize the sampler + llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + // helper function to evaluate a prompt and generate a response + auto generate = [&](const std::string & prompt) { + std::string response; + + // tokenize the prompt + const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); + std::vector prompt_tokens(n_prompt_tokens); + if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { + GGML_ABORT("failed to tokenize the prompt\n"); + } + + // prepare a batch for the prompt + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_token new_token_id; + while (true) { + // check if we have enough space in the context to evaluate this batch + int n_ctx = llama_n_ctx(ctx); + int n_ctx_used = llama_get_kv_cache_used_cells(ctx); + if (n_ctx_used + batch.n_tokens > n_ctx) { + printf("\033[0m\n"); + fprintf(stderr, "context size exceeded\n"); + exit(0); + } + + if (llama_decode(ctx, batch)) { + GGML_ABORT("failed to decode\n"); + } + + // sample the next token + new_token_id = llama_sampler_sample(smpl, ctx, -1); + + // is it an end of generation? + if (llama_token_is_eog(model, new_token_id)) { + break; + } + + // convert the token to a string, print it and add it to the response + char buf[256]; + int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + GGML_ABORT("failed to convert token to piece\n"); + } + std::string piece(buf, n); + printf("%s", piece.c_str()); + fflush(stdout); + response += piece; + + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1); + } + + return response; + }; + + std::vector messages; + std::vector formatted(llama_n_ctx(ctx)); + int prev_len = 0; + while (true) { + // get user input + printf("\033[32m> \033[0m"); + std::string user; + std::getline(std::cin, user); + + if (user.empty()) { + break; + } + + // add the user input to the message list and format it + messages.push_back({"user", strdup(user.c_str())}); + int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + if (new_len > (int)formatted.size()) { + formatted.resize(new_len); + new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); + } + if (new_len < 0) { + fprintf(stderr, "failed to apply the chat template\n"); + return 1; + } + + // remove previous messages to obtain the prompt to generate the response + std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len); + + // generate a response + printf("\033[33m"); + std::string response = generate(prompt); + printf("\n\033[0m"); + + // add the response to the messages + messages.push_back({"assistant", strdup(response.c_str())}); + prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0); + if (prev_len < 0) { + fprintf(stderr, "failed to apply the chat template\n"); + return 1; + } + } + + // free resources + for (auto & msg : messages) { + free(const_cast(msg.content)); + } + llama_sampler_free(smpl); + llama_free(ctx); + llama_free_model(model); + + return 0; +} diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 41df85557..2d93f31fa 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -558,10 +558,10 @@ extern "C" { enum ggml_log_level { GGML_LOG_LEVEL_NONE = 0, - GGML_LOG_LEVEL_INFO = 1, - GGML_LOG_LEVEL_WARN = 2, - GGML_LOG_LEVEL_ERROR = 3, - GGML_LOG_LEVEL_DEBUG = 4, + GGML_LOG_LEVEL_DEBUG = 1, + GGML_LOG_LEVEL_INFO = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_ERROR = 4, GGML_LOG_LEVEL_CONT = 5, // continue previous log }; From 7554aa4655f44b33a29068f2b18c5976fae45f9d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 2 Nov 2024 12:53:17 +0100 Subject: [PATCH 017/279] convert-lora : make `--base` optional (#10110) * convert-lora : make `--base` optional * lint * handle case where base_model_name_or_path is invalid * do not include metadata from base model * clarify unspecified --base * add small comment [no ci] * trigger ci --- convert_hf_to_gguf.py | 27 +++++++++++------------ convert_lora_to_gguf.py | 47 ++++++++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a34dabe23..76ee6cef5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -72,7 +72,8 @@ class Model: def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, + small_first_shard: bool = False, hparams: dict[str, Any] | None = None): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") @@ -87,7 +88,7 @@ class Model: self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) + self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -1541,6 +1542,17 @@ class LlamaModel(Model): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1557,17 +1569,6 @@ class LlamaModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 915e21836..ed1014cae 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -12,6 +12,7 @@ import json from math import prod from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast +from transformers import AutoConfig import torch @@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace: help="only print out what will be done, without writing any new files", ) parser.add_argument( - "--base", type=Path, required=True, - help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required", + "--base", type=Path, + help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", ) parser.add_argument( "lora_path", type=Path, @@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() +def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: + # normally, adapter does not come with base model config, we need to load it from AutoConfig + config = AutoConfig.from_pretrained(hf_model_id) + return config.to_dict() + + if __name__ == '__main__': args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) @@ -281,7 +288,7 @@ if __name__ == '__main__': ftype = ftype_map[args.outtype] - dir_base_model: Path = args.base + dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" @@ -301,9 +308,29 @@ if __name__ == '__main__': input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) + # load LoRA config + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + # load base model - logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + if dir_base_model is None: + if "base_model_name_or_path" in lparams: + model_id = lparams["base_model_name_or_path"] + logger.info(f"Loading base model from Hugging Face: {model_id}") + try: + hparams = load_hparams_from_hf(model_id) + except OSError as e: + logger.error(f"Failed to load base model config: {e}") + logger.error("Please try downloading the base model and add its path to --base") + sys.exit(1) + else: + logger.error("'base_model_name_or_path' is not found in adapter_config.json") + logger.error("Base model config is required. Please download the base model and add its path to --base") + sys.exit(1) + else: + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): try: model_class = Model.from_model_architecture(hparams["architectures"][0]) @@ -323,13 +350,15 @@ if __name__ == '__main__': self.dir_model_card = dir_lora_model self.lora_alpha = float(lora_alpha) + def set_vocab(self): + pass + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") def set_gguf_parameters(self): self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) - super().set_gguf_parameters() def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Never add extra tensors (e.g. rope_freqs) for LoRA adapters @@ -350,7 +379,7 @@ if __name__ == '__main__': logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") - logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()") + logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948") sys.exit(1) if base_name in tensor_map: @@ -384,9 +413,6 @@ if __name__ == '__main__': yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - alpha: float = lparams["lora_alpha"] model_instance = LoraModel( @@ -399,6 +425,7 @@ if __name__ == '__main__': dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, + hparams=hparams, ) logger.info("Exporting model...") From b634f8a26fef65210fd9fb2f87e83a2809535e89 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sat, 2 Nov 2024 13:08:53 +0100 Subject: [PATCH 018/279] simple-chat : only add bos on first prompt (#10129) --- examples/simple-chat/simple-chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 14264cfcb..5f9973163 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -96,7 +96,7 @@ int main(int argc, char ** argv) { // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); std::vector prompt_tokens(n_prompt_tokens); - if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { + if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) { GGML_ABORT("failed to tokenize the prompt\n"); } From 1926d6e39d6f6358bc1a4c52316a560178be7233 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 2 Nov 2024 15:18:56 +0200 Subject: [PATCH 019/279] llama : adjust default context size + print warnings (#10136) * llama : adjust default context size + print warnings ggml-ci * ggml-ci : add missing gpu-layers + adjust context sizes --- ci/run.sh | 164 ++++++++++++++++++++++++------------------------ common/common.h | 2 +- src/llama.cpp | 26 ++++++-- 3 files changed, 103 insertions(+), 89 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index dc26d94ee..21b62dd1e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -460,34 +460,34 @@ function gg_run_pythia_1_4b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -591,36 +591,36 @@ function gg_run_pythia_2_8b { ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -706,8 +706,8 @@ function gg_run_embd_bge_small { ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 - (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log set +e } @@ -752,7 +752,7 @@ function gg_run_rerank_tiny { model_f16="${path_models}/ggml-model-f16.gguf" # for this model, the SEP token is "" - (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log + (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log # sample output # rerank score 0: 0.029 diff --git a/common/common.h b/common/common.h index cd5a8e051..727f85baa 100644 --- a/common/common.h +++ b/common/common.h @@ -155,7 +155,7 @@ struct common_sampler_params { struct common_params { int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 0; // context size + int32_t n_ctx = 4096; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt diff --git a/src/llama.cpp b/src/llama.cpp index 0991c4089..3f534596e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19440,12 +19440,26 @@ struct llama_context * llama_new_context_with_model( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } ctx->abort_callback = params.abort_callback; ctx->abort_callback_data = params.abort_callback_data; From 45950415ed985830c59bf42cf9c9216b20cf08ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 2 Nov 2024 18:34:00 +0200 Subject: [PATCH 020/279] server : fix endpoint checks (#10135) ggml-ci --- examples/server/server.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 54cdb4b72..5c1af549b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2703,8 +2703,8 @@ int main(int argc, char ** argv) { }; const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { - if (ctx_server.params.embedding || ctx_server.params.reranking) { - res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2809,8 +2809,8 @@ int main(int argc, char ** argv) { // TODO: maybe merge this function with "handle_completions_generic" const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding || ctx_server.params.reranking) { - res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2935,11 +2935,6 @@ int main(int argc, char ** argv) { }; const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - // TODO: somehow clean up this checks in the future - if (!ctx_server.params.embedding || ctx_server.params.reranking) { - res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } const json body = json::parse(req.body); bool is_openai = false; @@ -2991,10 +2986,11 @@ int main(int argc, char ** argv) { }; const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params.reranking) { - res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + if (!ctx_server.params.reranking || ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); return; } + const json body = json::parse(req.body); // TODO: implement From 42cadc74bda60afafb45b71b1a39d150ede0ed4d Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Sat, 2 Nov 2024 16:34:56 +0000 Subject: [PATCH 021/279] server : fix slot selection by lru (#10126) * server : fix slot selection by lru, migrate lcs to `size_t` * minor debug log fix --- examples/server/server.cpp | 14 ++++++++------ examples/server/utils.hpp | 14 +++++++------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5c1af549b..8531a784d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -247,6 +247,7 @@ struct server_slot { if (is_processing()) { SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_last_used = ggml_time_us(); t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; callback_on_release(id); @@ -730,7 +731,7 @@ struct server_context { // find the slot that has at least n% prompt similarity if (ret == nullptr && slot_prompt_similarity != 0.0f) { - int max_lcs_len = 0; + int lcs_len = 0; float similarity = 0; for (server_slot & slot : slots) { @@ -745,20 +746,21 @@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length - similarity = static_cast(lcs_len) / static_cast(slot.cache_tokens.size()); + float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); // select the current slot if the criteria match - if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) { - max_lcs_len = lcs_len; + if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) { + lcs_len = cur_lcs_len; + similarity = cur_similarity; ret = &slot; } } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity); + SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity); } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 871a17a4f..c47ed3e47 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok } // get the lengths of the input sequences - int a_len = a.size(); - int b_len = b.size(); + size_t a_len = a.size(); + size_t b_len = b.size(); // initialize the maximum length of the longest common subsequence (LCS) - int max_length = 0; + size_t max_length = 0; // use two rows instead of a 2D matrix to optimize space - std::vector prev_row(b_len + 1, 0); - std::vector curr_row(b_len + 1, 0); + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); // iterate through the elements of a - for (int i = 1; i <= a_len; i++) { + for (size_t i = 1; i <= a_len; i++) { // iterate through the elements of b - for (int j = 1; j <= b_len; j++) { + for (size_t j = 1; j <= b_len; j++) { // if elements at the current positions match if (a[i - 1] == b[j - 1]) { // if it's the first element of either sequences, set LCS length to 1 From 9830b6923b61f1e652a35afeac77aa5f886dad09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6hnenkamp?= Date: Sat, 2 Nov 2024 23:35:31 +0100 Subject: [PATCH 022/279] Add apple arm to presets (#10134) * Add apple arm to presets * Add final new line --- CMakePresets.json | 13 +++++++++++++ cmake/arm64-apple-clang.cmake | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 cmake/arm64-apple-clang.cmake diff --git a/CMakePresets.json b/CMakePresets.json index d22ffa490..ae45d60af 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -48,10 +48,23 @@ } }, + { + "name": "arm64-apple-clang", "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x64", "strategy": "external" }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake" + } + }, + { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, + { "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] }, + { "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] }, + { "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] }, + { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, diff --git a/cmake/arm64-apple-clang.cmake b/cmake/arm64-apple-clang.cmake new file mode 100644 index 000000000..5fcd2882a --- /dev/null +++ b/cmake/arm64-apple-clang.cmake @@ -0,0 +1,16 @@ +set( CMAKE_SYSTEM_NAME Darwin ) +set( CMAKE_SYSTEM_PROCESSOR arm64 ) + +set( target arm64-apple-darwin-macho ) + +set( CMAKE_C_COMPILER clang ) +set( CMAKE_CXX_COMPILER clang++ ) + +set( CMAKE_C_COMPILER_TARGET ${target} ) +set( CMAKE_CXX_COMPILER_TARGET ${target} ) + +set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) +set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" ) + +set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) +set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) From 1839f69130151ceeac4d01c0ef8964e1fb43bba6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Nov 2024 15:14:15 +0200 Subject: [PATCH 023/279] flake.lock: Update (#10146) --- flake.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/flake.lock b/flake.lock index 732c7539c..c170c4952 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1727826117, - "narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=", + "lastModified": 1730504689, + "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1", + "rev": "506278e768c2a08bec68eb62932193e341f55c90", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1729665710, - "narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=", + "lastModified": 1730200266, + "narHash": "sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d", + "rev": "807e9154dcb16384b1b765ebe9cd2bba2ac287fd", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1727825735, - "narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=", + "lastModified": 1730504152, + "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz" } }, "root": { From 08828a6d7d0006a487c9655ba8ace0ebe35ecad1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 3 Nov 2024 15:18:40 +0200 Subject: [PATCH 024/279] metal : minor fixup in FA kernel (#10143) * metal : minor fixup in FA kernel ggml-ci * metal : use the unrolled loop variable * metal : remove unused var --- ggml/src/ggml-metal.metal | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index defde6246..57eb34f13 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -2776,11 +2776,11 @@ kernel void kernel_flash_attn_ext_vec_f16( const short iv3 = iq3 / rv3; // load the queries from shared memory into local memory - float4 mq[D4]; + float4 mq[D4/NW]; for (short ii = 0; ii < D4; ii += NW) { short i = ii + tiisg; - mq[i] = (float4) sq4[i]; + mq[ii/NW] = (float4) sq4[i]; } // pointer to the mask @@ -2812,7 +2812,7 @@ kernel void kernel_flash_attn_ext_vec_f16( mk[2] = (float4) pk4[i + 2*(nb11/8)]; mk[3] = (float4) pk4[i + 3*(nb11/8)]; - mqk += (float4) (mq[i] * mk); + mqk += (float4) (mq[ii/NW] * mk); } // reduce the results from the threads in the simdgroup @@ -2857,8 +2857,7 @@ kernel void kernel_flash_attn_ext_vec_f16( // O = diag(ms)*O #pragma unroll for (short ii = 0; ii < D4; ii += NW) { - const short i = ii + tiisg; - lo[i/NW] *= ms; + lo[ii/NW] *= ms; } } @@ -2872,10 +2871,10 @@ kernel void kernel_flash_attn_ext_vec_f16( for (short ii = 0; ii < D4; ii += NW) { const short i = ii + tiisg; - lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; - lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; - lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; - lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; + lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; + lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; + lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; + lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; } } } From 9f409893519b4a6def46ef80cd6f5d05ac0fb157 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 3 Nov 2024 19:34:08 +0100 Subject: [PATCH 025/279] ggml : move CPU backend to a separate file (#10144) --- Makefile | 21 +- Package.swift | 1 + common/CMakeLists.txt | 2 - common/common.cpp | 2 + common/train.cpp | 1515 --- common/train.h | 233 - examples/CMakeLists.txt | 1 - examples/baby-llama/CMakeLists.txt | 5 - examples/baby-llama/baby-llama.cpp | 1639 --- examples/llava/clip.cpp | 1 + examples/rpc/rpc-server.cpp | 2 + ggml/include/ggml-backend.h | 19 +- ggml/include/ggml-cpu.h | 150 + ggml/include/ggml.h | 149 +- ggml/src/CMakeLists.txt | 2 + ggml/src/ggml-aarch64.c | 1 + ggml/src/ggml-backend.cpp | 1237 +-- ggml/src/ggml-cpu.c | 13715 ++++++++++++++++++++++++ ggml/src/ggml-impl.h | 87 + ggml/src/ggml-rpc.cpp | 9 +- ggml/src/ggml.c | 15264 +-------------------------- include/llama.h | 1 + pocs/vdot/q8dot.cpp | 3 +- pocs/vdot/vdot.cpp | 10 +- spm-headers/ggml-cpu.h | 1 + src/llama.cpp | 2 + tests/test-backend-ops.cpp | 1 + tests/test-barrier.cpp | 1 + tests/test-grad0.cpp | 1 + tests/test-quantize-fns.cpp | 10 +- tests/test-quantize-perf.cpp | 6 +- tests/test-rope.cpp | 1 + 32 files changed, 14747 insertions(+), 19345 deletions(-) delete mode 100644 common/train.cpp delete mode 100644 common/train.h delete mode 100644 examples/baby-llama/CMakeLists.txt delete mode 100644 examples/baby-llama/baby-llama.cpp create mode 100644 ggml/include/ggml-cpu.h create mode 100644 ggml/src/ggml-cpu.c create mode 120000 spm-headers/ggml-cpu.h diff --git a/Makefile b/Makefile index 051436344..eb1da90f1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ libllava.a \ - llama-baby-llama \ llama-batched \ llama-batched-bench \ llama-bench \ @@ -56,7 +55,6 @@ TEST_TARGETS = \ tests/test-llama-grammar \ tests/test-log \ tests/test-model-load-cancel \ - tests/test-opt \ tests/test-quantize-fns \ tests/test-quantize-perf \ tests/test-rope \ @@ -64,6 +62,7 @@ TEST_TARGETS = \ tests/test-tokenizer-0 \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm +# tests/test-opt \ # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ @@ -916,6 +915,7 @@ endif # GGML_METAL OBJ_GGML += \ ggml/src/ggml.o \ + ggml/src/ggml-cpu.o \ ggml/src/ggml-alloc.o \ ggml/src/ggml-backend.o \ ggml/src/ggml-quants.o \ @@ -936,7 +936,6 @@ OBJ_COMMON = \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ - common/train.o \ common/build-info.o \ common/json-schema-to-grammar.o @@ -1048,6 +1047,12 @@ ggml/src/ggml.o: \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ +ggml/src/ggml-cpu.o: \ + ggml/src/ggml-cpu.c \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h + $(CC) $(CFLAGS) -c $< -o $@ + ggml/src/ggml-alloc.o: \ ggml/src/ggml-alloc.c \ ggml/include/ggml.h \ @@ -1213,11 +1218,6 @@ common/json-schema-to-grammar.o: \ common/json-schema-to-grammar.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common/train.o: \ - common/train.cpp \ - common/train.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - common/ngram-cache.o: \ common/ngram-cache.cpp \ common/ngram-cache.h @@ -1390,11 +1390,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-baby-llama: examples/baby-llama/baby-llama.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-export-lora: examples/export-lora/export-lora.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/Package.swift b/Package.swift index 3a17e6c34..d3661d13c 100644 --- a/Package.swift +++ b/Package.swift @@ -10,6 +10,7 @@ var sources = [ "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", + "ggml/src/ggml-cpu.c", "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.cpp", "ggml/src/ggml-quants.c", diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 042e895ad..5ab1ffa19 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -66,8 +66,6 @@ add_library(${TARGET} STATIC ngram-cache.h sampling.cpp sampling.h - train.cpp - train.h ) if (BUILD_SHARED_LIBS) diff --git a/common/common.cpp b/common/common.cpp index 7656843b1..c8cbaae11 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1951,6 +1951,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + ggml_cpu_init(); // some ARM features are detected at runtime + const auto & sparams = params.sparams; fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); diff --git a/common/train.cpp b/common/train.cpp deleted file mode 100644 index 661ad8382..000000000 --- a/common/train.cpp +++ /dev/null @@ -1,1515 +0,0 @@ -#include "train.h" -#include "common.h" - -#include -#include -#include -#include -#include - -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution rd; -}; - -struct train_state * init_train_state() { - struct train_state * state = new struct train_state; - state->train_its = 0; - state->train_samples = 0; - state->train_tokens = 0; - state->train_epochs = 0; - state->shuffle_samples_hash = 0; - state->shuffle_sample_count = 0; - state->shuffle_next_sample = 0; - state->shuffle_rng_state_current = ""; - state->shuffle_rng_state_next = ""; - - state->opt = new struct ggml_opt_context; - state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; - state->opt->loss_after = 0.0f; - - return state; -} - -void free_train_state(struct train_state * state) { - delete state->opt; - delete state; -} - -struct random_normal_distribution * init_random_normal_distribution( - int seed, float mean, float std, float min, float max -) { - struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; - return rnd; -} - -struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) { - struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution)); - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; - return rnd; -} - -void free_random_normal_distribution (struct random_normal_distribution * rnd) { - free(rnd); -} - -void free_random_uniform_distribution(struct random_uniform_distribution * rnd) { - free(rnd); -} - -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (ggml_n_dims(tensor)) { - case 1: - scale /= sqrtf((float) tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - die("Unsupported tensor->n_dims"); - }; - return tensor; -} - -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (ggml_n_dims(tensor)) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - die("Unsupported tensor->n_dims"); - }; - return tensor; -} - -float frand() { - return (float)rand()/((float)(RAND_MAX) + 1.0f); -} - -float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == 1); - GGML_ASSERT(tensor->ne[2] == 1); - GGML_ASSERT(tensor->ne[3] == 1); -} - -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == 1); - GGML_ASSERT(tensor->ne[3] == 1); -} - -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == 1); -} - -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); -} - -int64_t get_example_targets_batch( - struct llama_context * lctx, - struct ggml_tensor * tokens_input, - struct ggml_tensor * target_probs, - int64_t example_id, - const size_t * samples_offs, - const size_t * samples_begin, - const size_t * samples_size, - size_t samples_count, - const llama_token * train_data, - size_t n_train_data, - bool separate_with_eos, - bool separate_with_bos, - bool fill_with_next_samples, - bool sample_random_offsets -) { - GGML_ASSERT(samples_count > 0); - GGML_ASSERT(ggml_is_matrix(tokens_input)); - GGML_ASSERT(ggml_is_3d(target_probs)); - int64_t n_vocab = target_probs->ne[0]; - int64_t n_tokens = tokens_input->ne[0]; - int64_t n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - int64_t used_samples = 0; - - ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(llama_get_model(lctx)); - llama_token eos = llama_token_eos(llama_get_model(lctx)); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k= sample_size && fill_with_next_samples) { - if (!sample_separation_eos) { - // insert eos token to separate samples - sample_separation_eos = true; - } else if (!sample_separation_bos) { - // insert bos token to separate samples - sample_separation_bos = true; - token = bos; - } else { - // sample separation is done, continue with next sample - sample_separation_eos = !separate_with_eos; - sample_separation_bos = !separate_with_bos; - sample_offs = 0; - sample_idx = (example_id + used_samples) % samples_count; - sample_begin = samples_begin[sample_idx]; - sample_size = samples_size[sample_idx]; - ++used_samples; - } - } - // note: no else-if here - if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); - ++sample_offs; - } - ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); - if (i+1> rng; -} - -std::string mt19937_get_state(const std::mt19937& rng) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state << rng; - return s_rng_state.str(); -} - -std::string mt19937_seed_to_state(unsigned seed) { - std::mt19937 rng(seed); - return mt19937_get_state(rng); -} - -std::string shuffle_samples( - const std::string & rng_state, - size_t * shuffled_offs, - size_t * shuffled_begins, - size_t * shuffled_sizes, - const size_t * begins, - const size_t * sizes, - size_t count) { - if (count == 0) return rng_state; - - std::mt19937 rng; - mt19937_set_state(rng, rng_state); - - // sort indices by random value for each index - std::vector idcs; - { - std::vector rnd; - idcs.resize(count); - rnd.resize(count); - for (unsigned i=0; i h_string; - std::hash h_ull; - size_t h = h_string(std::string(fn)); - h = hash_combine(h, h_ull((unsigned long long) sample_count)); - for (size_t i=0; i< sample_count; ++i) { - h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); - h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); - } - return h; -} - -std::string replace_str(const char * s, const char * needle, const char * replacement) { - std::string str = s; - size_t pos = str.find(needle); - if (pos != std::string::npos) { - str.replace(pos, strlen(needle), replacement); - } - return str; -} - -void print_duration(double fmillis) { - if (fmillis < 1000.0f) { - printf("%.1fms", (float) fmillis); - return; - } - const int64_t one_sec = 1000; - const int64_t one_min = one_sec * 60; - const int64_t one_hour = one_min * 60; - const int64_t one_day = one_hour * 24; - - int64_t millis = (int64_t) fmillis; - int64_t days = millis/one_day; - int64_t hours = (millis - days*one_day)/one_hour; - int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; - int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; - - // to print int64_t either cast to (long long int) or use macro PRId64 from - if (days > 0) { - printf("%lldd ", (long long int) days); - } - printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); -} - -float cosine_decay(int64_t step, int64_t decay_steps, float minimum) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int64_t) (restart_step_mult * decay_steps); - } - return cosine_decay(step, decay_steps, minimum); -} - -float learning_schedule( - int64_t step, - int64_t warmup_steps, - int64_t cos_decay_steps, - float learning_rate, - float overall_minimum, - float cos_decay_minimum, - float cos_decay_restart_step_mult, - bool enable_restart) { - - float result = - (step < warmup_steps) - ? (float) step / (float) warmup_steps - : enable_restart - ? cosine_decay_restart( - step - warmup_steps, - cos_decay_steps, - cos_decay_minimum, - cos_decay_restart_step_mult) - : cosine_decay( - step, - cos_decay_steps, - cos_decay_minimum); - - float min = overall_minimum / learning_rate; - result = min + result * (1.0f - min); - return result; -} - -static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(a != NULL); - GGML_ASSERT(b != NULL); - GGML_ASSERT(a->type == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} - -void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - -// gguf constants -static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - -static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; -static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; -static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; -static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; -static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; -static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; - -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} - -void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_TYPE_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_TYPE_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - die("unknown optimizer type\n"); - } -} - -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_TYPE_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_TYPE_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} - -bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) { - if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) { - return false; - } - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); - GGML_ASSERT(file_version <= 1); - - if (file_version == 0) { - - GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); - - } else if (file_version == 1) { - - GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, train->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT); - GGUF_GET_KEY(fctx, train->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); - - GGUF_GET_KEY(fctx, train->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); - GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); - GGUF_GET_KEY(fctx, train->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, train->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); - } - - load_opt_context_gguf(fctx, f_ggml_ctx, train->opt); - return true; -} - -void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) { - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, train->train_epochs); - - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash); - gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, train->shuffle_rng_state_current.c_str()); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) train->shuffle_next_sample); - - save_opt_context_gguf(fctx, train->opt); -} - - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); - } - if (ret != 1) { - die("unexpectedly reached end of file"); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static size_t utf8_len(char src) { - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -// mark each byte with its utf8 unit number. -// returns the number of utf8 characters. -// e.g. when bytes == '\x61\xD0\xB0\x62', -// then utf8_units will become [0,0,1,0] -// utf8_nunits will become [1,2,2,1] and 3 is returned. -// bytes where utf8_units is zero, are the begin of an utf8 character. -static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { - size_t offs = 0; - size_t count_utf8 = 0; - while(offs < count) { - int len = (int) utf8_len(bytes[offs]); - for (int i=0; i & out_tokens, - std::vector & out_samples_begin, - std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); - - if (f.size == 0) { - out_tokens.clear(); - out_samples_begin.clear(); - out_samples_size.clear(); - printf("%s: warning: empty or not existing training data file '%s'\n", - __func__, filename); - return out_tokens.size(); - } - - // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] - const int n_max_tokens_overhead = 1; - - std::vector buf; - buf.resize(f.size); - - f.read_raw(buf.data(), f.size); - - std::vector utf8_units; - std::vector utf8_nunits; - utf8_units.resize(buf.size()); - utf8_nunits.resize(buf.size()); - mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); - - if (sample_start.size() == 0) { - // tokenize all data at once - out_tokens.resize(buf.size() + n_max_tokens_overhead); - - int n_tokens = llama_tokenize( - llama_get_model(lctx), - buf.data(), - (int) buf.size(), - out_tokens.data(), - (int) out_tokens.size(), - false, false); - if (n_tokens < 0) { - out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize( - llama_get_model(lctx), - buf.data(), - (int) buf.size(), - out_tokens.data(), - (int) out_tokens.size(), - false, false); - } - if (n_tokens >= 0) { - out_tokens.resize(n_tokens); - } - - // generate sample starts at all token positions - out_samples_begin.clear(); - out_samples_begin.push_back(0); - out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); - size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; - for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { - out_samples_begin.push_back(sample_begin); - out_samples_size.push_back(context_length); - } - } else { - // split data into samples and tokenize each sample - std::string data_str(buf.data(), buf.size()); - out_samples_begin.clear(); - out_samples_size.clear(); - out_tokens.clear(); - - // find all positions of pattern sample_start - size_t sample_begin = data_str.find(sample_start, 0); - while (sample_begin != std::string::npos) { - out_samples_begin.push_back(sample_begin); - const size_t search_start = sample_begin + sample_start.size(); - sample_begin = data_str.find(sample_start, search_start); - } - if (out_samples_begin.size() == 0) { - printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", - __func__, sample_start.c_str()); - out_samples_begin.push_back(0); - } - - out_samples_size.resize(out_samples_begin.size(), 0); - - std::vector buf_sample; - std::vector tok_sample; - - const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); - size_t found_too_big_sample = 0; - size_t found_too_small_sample = 0; - size_t found_empty_sample = 0; - size_t found_min_sample_size = SIZE_MAX; - size_t found_max_sample_size = 0; - - size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(llama_get_model(lctx)); - for (llama_token token=0; token < n_vocab; ++token) { - max_token_text_size = std::max( - max_token_text_size, - strlen(llama_token_get_text(llama_get_model(lctx), token))); - } - - // upper bound of context byte length. - // strings with this byte length should always tokenize to at least context_length tokens. - size_t context_byte_len = max_token_text_size*context_length; - - for (unsigned i=0; i 0) { - // sample end is in the middle of an utf8 character. - // advance sample_end to the begin of the next utf8 character. - sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; - } - size_t sample_size = sample_end - sample_begin; - if (sample_size == 0) { - ++found_empty_sample; - } - - if (sample_size > 0) { - // llama_tokenize expects zero terminated string, - // copy sample into buffer and zero terminate it. - buf_sample.resize(sample_size); - memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); - - // printf("sample: '%s'\n", buf_sample.data()); - - // tokenize the sample - tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(llama_get_model(lctx), - buf_sample.data(), - (int) buf_sample.size(), - tok_sample.data(), - (int) tok_sample.size(), - false, false); - if (n_tokens < 0) { - tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(llama_get_model(lctx), - buf_sample.data(), - (int) buf_sample.size(), - tok_sample.data(), - (int) tok_sample.size(), - false, false); - GGML_ASSERT(n_tokens >= 0); - } - GGML_ASSERT(n_tokens <= (int) tok_sample.size()); - - if ((size_t) n_tokens > context_length) { - ++found_too_big_sample; - } else if ((size_t) n_tokens < context_length) { - ++found_too_small_sample; - } - found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); - found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); - - // write out tokens, start and size of sample - // overwrite the string start position with the token start position - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = (size_t) n_tokens; - out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); - } else { - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = 0; - } - - } - if (found_too_big_sample > 0) { - printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n", - __func__, found_too_big_sample, found_max_sample_size, context_length); - } - - if (found_too_small_sample > 0) { - printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", - __func__, found_too_small_sample, found_min_sample_size, context_length); - } - - if (found_empty_sample) { - printf("%s: warning: found %zu empty samples.\n", - __func__, found_empty_sample); - } - } - printf("%s: total number of samples: %zu\n", - __func__, out_samples_begin.size()); - - GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); - - return out_tokens.size(); -} - -std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) { - std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); - return replace_str(filename, pattern_it, sit.c_str()); -} - -struct train_params_common get_default_train_params_common() { - struct train_params_common params; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.gguf"; - params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; - params.pattern_fn_it = "ITERATION"; - params.fn_latest = "LATEST"; - - params.print_usage = false; - - params.save_every = 10; - - params.seed = -1; - - params.n_ctx = 128; - params.n_threads = 6; - params.n_batch = 8; - params.n_gradient_accumulation = 1; - params.n_epochs = -1; - params.n_gpu_layers = 0; - - params.custom_n_ctx = false; - - params.use_flash = false; - params.use_checkpointing = true; - - params.sample_start = ""; - params.include_sample_start = false; - params.escape = false; - params.overlapping_samples = false; - params.fill_with_next_samples = false; - params.separate_with_eos = false; - params.separate_with_bos = true; - params.sample_random_offsets = false; - params.force_reshuffle = false; - - params.opt_past = 0; - params.opt_delta = 1e-5f; - params.opt_max_no_improvement = 0; - - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_min = 0.1f; - params.enable_restart = false; - - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 0; - params.adam_decay = 1e-1f; - params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; - - return params; -} - -void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) { - // fprintf(stderr, "usage: %s [options]\n", argv[0]); - // fprintf(stderr, "\n"); - // fprintf(stderr, "options:\n"); - // fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); - fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); - fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); - fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); - fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); - fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); - fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); - fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); - fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); - fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); - fprintf(stderr, " --sample-random-offsets Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : ""); - fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); - fprintf(stderr, " --no-flash Don't use flash attention \n"); - fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); - fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); - fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); - fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); - fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); - fprintf(stderr, " --epochs N Maximum number epochs to process. (default %d)\n", params->n_epochs); - fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); - fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); - fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); - fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); - fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); - fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); - fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); - fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); - fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers); - fprintf(stderr, "\n"); -} - -bool consume_common_train_arg( - int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param -) { - int& i = *idx; - std::string arg = argv[i]; - const std::string arg_prefix = "--"; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg == "--train-data") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->fn_train_data = argv[i]; - } else if (arg == "--checkpoint-in") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->fn_checkpoint_in = argv[i]; - } else if (arg == "--checkpoint-out") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->fn_checkpoint_out = argv[i]; - } else if (arg == "--pattern-fn-it") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->pattern_fn_it = argv[i]; - } else if (arg == "--fn-latest") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->fn_latest = argv[i]; - } else if (arg == "--save-every") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->save_every = std::stoi(argv[i]); - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->seed = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->n_ctx = std::stoi(argv[i]); - params->custom_n_ctx = true; - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "--grad-acc") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); - } else if (arg == "--sample-start") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->sample_start = std::string(argv[i]); - } else if (arg == "--escape") { - params->escape = true; - } else if (arg == "--include-sample-start") { - params->include_sample_start = true; - } else if (arg == "--overlapping-samples") { - params->overlapping_samples = true; - } else if (arg == "--fill-with-next-samples") { - params->fill_with_next_samples = true; - } else if (arg == "--separate-with-eos") { - params->separate_with_eos = true; - } else if (arg == "--separate-with-bos") { - params->separate_with_bos = true; - } else if (arg == "--no-separate-with-eos") { - params->separate_with_eos = false; - } else if (arg == "--no-separate-with-bos") { - params->separate_with_bos = false; - } else if (arg == "--sample-random-offsets") { - params->sample_random_offsets = true; - } else if (arg == "--force-reshuffle") { - params->force_reshuffle = true; - } else if (arg == "--no-flash") { - params->use_flash = false; - } else if (arg == "--use-flash") { - params->use_flash = true; - } else if (arg == "--no-checkpointing") { - params->use_checkpointing = false; - } else if (arg == "--use-checkpointing") { - params->use_checkpointing = true; - } else if (arg == "--warmup") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->warmup = std::stoi(argv[i]); - } else if (arg == "--cos-decay-steps") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->cos_decay_steps = std::stoi(argv[i]); - } else if (arg == "--cos-decay-restart") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-min") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->cos_decay_min = std::stof(argv[i]); - } else if (arg == "--enable-restart") { - params->enable_restart = true; - } else if (arg == "--disable-restart") { - params->enable_restart = false; - } else if (arg == "--opt-past") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->opt_past = std::stoi(argv[i]); - } else if (arg == "--opt-delta") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->opt_delta = std::stof(argv[i]); - } else if (arg == "--opt-max-no-improvement") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->opt_max_no_improvement = std::stoi(argv[i]); - } else if (arg == "--adam-epsf") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_eps_f = std::stof(argv[i]); - } else if (arg == "--epochs") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->n_epochs = std::stoi(argv[i]); - } else if (arg == "--adam-iter") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-alpha") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_alpha = std::stof(argv[i]); - } else if (arg == "--adam-min-alpha") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_min_alpha = std::stof(argv[i]); - } else if (arg == "--adam-decay") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_decay = std::stof(argv[i]); - } else if (arg == "--adam-decay-min-ndim") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_decay_min_ndim = std::stoi(argv[i]); - } else if (arg == "--adam-beta1") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_beta1 = std::stof(argv[i]); - } else if (arg == "--adam-beta2") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_beta2 = std::stof(argv[i]); - } else if (arg == "--adam-gclip") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - params->adam_gclip = std::stof(argv[i]); - } else if (arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - *invalid_param = true; - return true; - } - if (llama_supports_gpu_offload()) { - params->n_gpu_layers = std::stoi(argv[i]); - } else { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } else if (arg == "-h" || arg == "--help") { - params->print_usage = true; - return true; - } else { - return false; - } - return true; -} - -void finish_processing_train_args(struct train_params_common * params) { - if (params->escape) { - string_process_escapes(params->sample_start); - } -} - -void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) { - struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata; - struct train_params_common * params = data->params; - struct train_state * train = data->train; - struct ggml_opt_context * opt = train->opt; - int n_batch = params->n_batch; - int n_ctx = params->n_ctx; - - if (accum_step == 0) { - // time measurement - int64_t now = ggml_time_ms(); - if (now > data->last_time && opt->iter > data->first_iter) { - double dt = (double) (now - data->last_time); - if (data->millis_per_iter == 0.0) { - data->millis_per_iter = dt; - } else { - const double gain = 0.7; - data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain; - } - } - - double remaining_millis = 0.0; - if (data->millis_per_iter > 0.0) { - const int n_iter = params->adam_n_iter; - const int done_iter = opt->iter - data->first_iter; - const int remaining_iter = n_iter - done_iter; - remaining_millis = remaining_iter * data->millis_per_iter; - } - - // file saving - const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); - if (save_now) { - int new_iters = opt->iter - data->last_save_iter; - train->train_its += new_iters; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; - - if (data->save_cb) { - data->save_cb(data->save_data, train); - } - - data->last_save_iter = opt->iter; - } - - // exclude file saving from time measurement, by measuring last_time after saving - data->last_time = ggml_time_ms(); - - *sched = learning_schedule( - opt->iter, - params->warmup, - params->cos_decay_steps, - params->adam_alpha, - params->adam_min_alpha, - params->cos_decay_min, - params->cos_decay_restart, - params->enable_restart); - - int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); - if (impr_plot > 0) impr_plot = 0; - if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; - printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", - __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, - *sched, opt->loss_after); - - - if (data->millis_per_iter > 0) { - printf(" dt="); - print_duration(data->millis_per_iter); - printf(" eta="); - print_duration(remaining_millis); - } - - float improvement = opt->loss_before - opt->loss_after; - const float plot_scale = 10.0f; - int bar_len = (int)(1 + improvement*plot_scale + 0.5); - printf(" |"); - for (int i=0; i"); - printf("\n"); - } - - int64_t used_samples = get_example_targets_batch( - data->lctx, - data->tokens_input, - data->target_probs, - train->shuffle_next_sample, - data->shuffled_samples_offs, - data->shuffled_samples_begin, - data->shuffled_samples_size, - data->samples_count, - data->tokens_data, - data->tokens_size, - params->separate_with_eos, - params->separate_with_bos, - params->fill_with_next_samples, - params->sample_random_offsets); - - train->train_samples += used_samples; - train->shuffle_next_sample += used_samples; - - if (train->shuffle_next_sample >= train->shuffle_sample_count) { - ++train->train_epochs; - printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); - // note: we may have used some samples from the current shuffling more than once - train->shuffle_rng_state_current = train->shuffle_rng_state_next; - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - data->shuffled_samples_offs, - data->shuffled_samples_begin, - data->shuffled_samples_size, - data->samples_begin, - data->samples_size, - data->samples_count); - train->shuffle_next_sample = 0; - } - - const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs); - if (last_epoch_reached) { - // allow optimization iteration at last epoch to be completed before canceling - if (data->iter_at_last_epoch < 0) { - data->iter_at_last_epoch = opt->iter; - } else if (opt->iter > data->iter_at_last_epoch) { - *cancel = true; - } - } -} diff --git a/common/train.h b/common/train.h deleted file mode 100644 index 263d940c0..000000000 --- a/common/train.h +++ /dev/null @@ -1,233 +0,0 @@ -// Various helper functions and utilities for training - -#pragma once - -#include -#include -#include - -#include "ggml.h" -#include "llama.h" - -#define LLAMA_TRAIN_MAX_NODES 16384 - -typedef std::string mt19937_state; - -struct train_state { - struct ggml_opt_context * opt; - - uint64_t train_its; - uint64_t train_samples; - uint64_t train_tokens; - uint64_t train_epochs; - - size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) - mt19937_state shuffle_rng_state_current; - mt19937_state shuffle_rng_state_next; - size_t shuffle_sample_count; - size_t shuffle_next_sample; -}; - -struct train_params_common { - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; - const char * pattern_fn_it; - const char * fn_latest; - - bool print_usage; - - int save_every; - - uint32_t seed; - - int n_ctx; - int n_threads; - int n_batch; - int n_gradient_accumulation; - int n_epochs; - int n_gpu_layers; - - bool custom_n_ctx; - - bool use_flash; - bool use_checkpointing; - - std::string sample_start; - bool include_sample_start; - bool escape; - bool overlapping_samples; - bool fill_with_next_samples; - bool separate_with_eos; - bool separate_with_bos; - bool sample_random_offsets; - - bool force_reshuffle; - - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_min; - bool enable_restart; - - int opt_past; - float opt_delta; - int opt_max_no_improvement; - - int adam_n_iter; - float adam_alpha; - float adam_min_alpha; - float adam_decay; - int adam_decay_min_ndim; - float adam_beta1; - float adam_beta2; - float adam_gclip; - float adam_eps_f; -}; - -typedef void (*save_train_files_callback)(void * data, struct train_state * train); - -struct train_opt_callback_data { - struct train_params_common * params; - struct train_state * train; - save_train_files_callback save_cb; - void * save_data; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_offs; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_probs; - int first_iter; - int first_epoch; - int iter_at_last_epoch; - int64_t last_time; - double millis_per_iter; -}; - -struct train_state * init_train_state(); -void free_train_state(struct train_state * state); - -struct train_params_common get_default_train_params_common(); -void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params); - -bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param); -void finish_processing_train_args(struct train_params_common * params); - -struct random_normal_distribution; -struct random_uniform_distribution; - -struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max); -struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max); - -void free_random_normal_distribution (struct random_normal_distribution * rnd); -void free_random_uniform_distribution(struct random_uniform_distribution * rnd); - -struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd); -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd); - -// generate random float in interval [0,1) -float frand(); -float frand_normal (struct random_normal_distribution * rnd); -float frand_uniform(struct random_uniform_distribution * rnd); - -int clamp (const int v, const int min, const int max); -float fclamp(const float v, const float min, const float max); - -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0); -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1); -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2); -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); - -size_t tokenize_file( - struct llama_context * lctx, - const char * filename, - const std::string & sample_start, - bool include_sample_start, - bool overlapping_samples, - unsigned context_length, - std::vector & out_tokens, - std::vector & out_samples_begin, - std::vector & out_samples_size); - -int64_t get_example_targets_batch( - struct llama_context * lctx, - struct ggml_tensor * tokens_input, - struct ggml_tensor * target_probs, - int64_t example_id, - const size_t * samples_offs, - const size_t * samples_begin, - const size_t * samples_size, - size_t samples_count, - const llama_token * train_data, - size_t n_train_data, - bool separate_with_eos, - bool separate_with_bos, - bool fill_with_next_samples, - bool sample_random_offsets); - - -void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state); -mt19937_state mt19937_get_state(const std::mt19937& rng); -mt19937_state mt19937_seed_to_state(unsigned seed); - -mt19937_state shuffle_samples( - const mt19937_state & rng_state, - size_t * shuffled_offs, - size_t * shuffled_begins, - size_t * shuffled_sizes, - const size_t * begins, - const size_t * sizes, - size_t count); - -size_t hash_combine(size_t h1, size_t h2); - -size_t compute_samples_hash( - const char* fn, - const size_t* samples_begin, - const size_t* samples_size, - size_t sample_count); - - -std::string replace_str(const char * s, const char * needle, const char * replacement); - -void print_duration(double milliseconds); - -float cosine_decay( - int64_t step, - int64_t decay_steps, - float minimum); - -float cosine_decay_restart( - int64_t step, - int64_t decay_steps, - float minimum, - float restart_step_mult); - -float learning_schedule( - int64_t step, - int64_t warmup_steps, - int64_t decay_steps, - float learning_rate, - float overall_minimum, - float cos_decay_minimum, - float cos_decay_restart_step_mult, - bool enable_restart); - -void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name); - -void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt); -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt); - -bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train); -void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train); - -std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); - -void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6df318c19..d63a96c1c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() add_subdirectory(cvector-generator) - add_subdirectory(baby-llama) add_subdirectory(batched-bench) add_subdirectory(batched) add_subdirectory(convert-llama2c-to-ggml) diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt deleted file mode 100644 index 71b82105c..000000000 --- a/examples/baby-llama/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-baby-llama) -add_executable(${TARGET} baby-llama.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp deleted file mode 100644 index 3ce91070b..000000000 --- a/examples/baby-llama/baby-llama.cpp +++ /dev/null @@ -1,1639 +0,0 @@ -#include "ggml.h" -#include "train.h" - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#ifdef LLAMA_DEFAULT_RMS_EPS -constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; -#else -constexpr float rms_norm_eps = 5e-6f; -#endif - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -static struct ggml_tensor * randomize_tensor( - struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax -) { - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin; - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - } - break; - default: - assert(false); - } - - return tensor; -} - -struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); - } -}; - -static uint32_t get_n_ff(const struct llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - -struct llama_hparams_lora { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_lora = 64; - - bool operator!=(const llama_hparams_lora & other) const { - return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; - } -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct llama_layer_lora { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wqa; - struct ggml_tensor * wqb; - struct ggml_tensor * wka; - struct ggml_tensor * wkb; - struct ggml_tensor * wva; - struct ggml_tensor * wvb; - struct ggml_tensor * woa; - struct ggml_tensor * wob; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - - -struct llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - -struct llama_model { - struct ggml_context * ctx = NULL; - - llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; -}; - -struct llama_model_lora { - struct ggml_context * ctx = NULL; - - llama_hparams_lora hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * outputa; - struct ggml_tensor * outputb; - - std::vector layers; -}; - -static void init_model(struct llama_model * model) { - const auto & hparams = model->hparams; - - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_ff = get_n_ff(&hparams); - - struct ggml_context * ctx = model->ctx; - - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab}); - - model->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - // std::string layers_i = "layers." + std::to_string(i); - - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); - - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); - - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); - - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); - } -} - - -static void init_model_lora(struct llama_model_lora * model) { - const auto & hparams = model->hparams; - - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_mult = hparams.n_mult; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - const uint32_t n_lora = hparams.n_lora; - - const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; - - struct ggml_context * ctx = model->ctx; - - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab}); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd}); - model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab}); - model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab}); - - model->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - // std::string layers_i = "layers." + std::to_string(i); - - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd}); - - layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); - layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd}); - - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd}); - - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); - } -} - -static void set_param_model(struct llama_model * model) { - const auto& hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; - - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } -} - -static void set_param_model_lora(struct llama_model_lora * model) { - const auto& hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; - - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->outputa); - ggml_set_param(ctx, model->outputb); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wqa); - ggml_set_param(ctx, layer.wqb); - ggml_set_param(ctx, layer.wka); - ggml_set_param(ctx, layer.wkb); - ggml_set_param(ctx, layer.wva); - ggml_set_param(ctx, layer.wvb); - ggml_set_param(ctx, layer.woa); - ggml_set_param(ctx, layer.wob); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } -} - -static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { - const auto & hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(model->tok_embeddings , rnd); - randomize_tensor_normal(model->norm , rnd); - randomize_tensor_normal(model->output , rnd); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, rnd); - - randomize_tensor_normal(layer.wq, rnd); - randomize_tensor_normal(layer.wk, rnd); - randomize_tensor_normal(layer.wv, rnd); - randomize_tensor_normal(layer.wo, rnd); - - randomize_tensor_normal(layer.ffn_norm, rnd); - - randomize_tensor_normal(layer.w1, rnd); - randomize_tensor_normal(layer.w2, rnd); - randomize_tensor_normal(layer.w3, rnd); - } - - free_random_normal_distribution(rnd); -} - - -static void randomize_model_lora( - struct llama_model_lora * model, int seed, float mean, float std, float min, float max -) { - const auto & hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(model->tok_embeddings, rnd); - randomize_tensor_normal(model->norm , rnd); - randomize_tensor_normal(model->outputa , rnd); - randomize_tensor_normal(model->outputb , rnd); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, rnd); - - randomize_tensor_normal(layer.wqa, rnd); - randomize_tensor_normal(layer.wqb, rnd); - randomize_tensor_normal(layer.wka, rnd); - randomize_tensor_normal(layer.wkb, rnd); - randomize_tensor_normal(layer.wva, rnd); - randomize_tensor_normal(layer.wvb, rnd); - randomize_tensor_normal(layer.woa, rnd); - randomize_tensor_normal(layer.wob, rnd); - - randomize_tensor_normal(layer.ffn_norm, rnd); - - randomize_tensor_normal(layer.w1, rnd); - randomize_tensor_normal(layer.w2, rnd); - randomize_tensor_normal(layer.w3, rnd); - } - - free_random_normal_distribution(rnd); -} - -static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - exit(1); - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); -} - -static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - -static struct ggml_tensor * forward( - struct llama_model * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past -) { - const int N = n_tokens; - - struct llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - - // inpL shape [n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -static struct ggml_tensor * forward_batch( - struct llama_model * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past, - const int n_batch -) { - const int N = n_tokens; - - struct llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [N, n_embd, n_batch, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wv, - cur), - n_embd, N, n_batch), - 1, 0, 2, 3)); - - assert_shape_3d(Vcur, N, n_embd, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] - // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_2d(ctx0, kc, - ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), - ggml_element_size(kc)*n_embd*n_ctx, - (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, - ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), - ggml_element_size(vc)*n_ctx*n_embd, - ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); - - assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); - assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); - } - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, n_past + N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_4d(ctx0, - ggml_view_3d(ctx0, - kc, - n_embd, - (n_past + N), - n_batch, - n_embd*ggml_element_size(kc), - n_ctx*n_embd*ggml_element_size(kc), - il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), - n_embd/n_head, n_head, n_past + N, n_batch), - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); - - // K * Q - // KQ shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); - assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); - - // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] - struct ggml_tensor * V = - ggml_view_4d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, n_batch, - ggml_element_size(vc)*n_ctx, - ggml_element_size(vc)*n_ctx*n_embd/n_head, - ggml_element_size(vc)*n_ctx*n_embd, - il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); - assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -static struct ggml_tensor * forward_lora( - struct llama_model_lora * model, - struct llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past -) { - const int N = n_tokens; - - struct llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - - // inpL shape [n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wqa, - ggml_mul_mat(ctx0, - model->layers[il].wqb, - cur)), - n_embd/n_head, n_head, N), - KQ_pos, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wka, - ggml_mul_mat(ctx0, - model->layers[il].wkb, - cur)), - n_embd/n_head, n_head, N), - KQ_pos, n_rot, 0); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_transpose(ctx0, - ggml_reshape_2d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wva, - ggml_mul_mat(ctx0, - model->layers[il].wvb, - cur)), - n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].woa, - ggml_mul_mat(ctx0, - model->layers[il].wob, - cur)); - } - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, - model->outputa, - ggml_mul_mat(ctx0, - model->outputb, - inpL)); - - // ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(ggml_is_matrix(logits)); - assert(ggml_is_matrix(probs)); - assert(ggml_is_vector(best_samples)); - assert(logits->ne[1] == best_samples->ne[0]); - assert(logits->ne[0] == probs->ne[0]); - assert(logits->ne[1] == probs->ne[1]); - for (int i = 0; i < logits->ne[1]; ++i) { - float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]); - ggml_set_i32_1d(best_samples, i, 0); - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - if (logit > max_logit) { - max_logit = logit; - ggml_set_i32_1d(best_samples, i, k); - } - } - float psum = 0; - for (int k = 0; k < logits->ne[0]; ++k) { - float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k); - float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit); - psum += p; - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p); - } - for (int k = 0; k < logits->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); - ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum); - } - } -} - -static void sample_softmax_batch( - struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, - struct ggml_tensor * best_samples -) { - GGML_ASSERT(ggml_is_matrix(best_samples)); - GGML_ASSERT(ggml_is_3d(logits)); - GGML_ASSERT(ggml_is_3d(probs)); - int n_tokens = best_samples->ne[0]; - int n_batch = best_samples->ne[1]; - int n_vocab = logits->ne[0]; - GGML_ASSERT(n_tokens == logits->ne[1]); - GGML_ASSERT(n_batch == logits->ne[2]); - GGML_ASSERT(n_vocab == probs->ne[0]); - GGML_ASSERT(n_tokens == probs->ne[1]); - GGML_ASSERT(n_batch == probs->ne[2]); - - for (int k = 0; k < n_batch; ++k) { - struct ggml_tensor * best_samples_k = ggml_view_1d(ctx, - best_samples, - best_samples->ne[0], - k*best_samples->nb[1]); - struct ggml_tensor * logits_k = ggml_view_2d(ctx, - logits, - logits->ne[0], - logits->ne[1], - logits->nb[1], - k*logits->nb[2]); - struct ggml_tensor * probs_k = ggml_view_2d(ctx, - probs, - probs->ne[0], - probs->ne[1], - probs->nb[1], - k*probs->nb[2]); - sample_softmax(logits_k, probs_k, best_samples_k); - } -} - -static void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); - printf(" %.2f", p); - } - printf("\n"); -} - -static void print_matrix(struct ggml_tensor * probs) { - assert(ggml_is_matrix(probs)); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); - printf(" %.2f", p); - } - printf("\n"); - } -} - -static void print_token(int token, int n_vocab) { - for (int k = 0; k < token; ++k) { - printf(" "); - } - printf("X"); - for (int k = token+1; k < n_vocab; ++k) { - printf(" "); - } - printf("\n"); -} - -static void print_tokens(struct ggml_tensor * tokens, int n_vocab) { - for (int i=0; ine[0]; ++i) { - int token = ggml_get_i32_1d(tokens, i); - print_token(token, n_vocab); - } -} - -static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; - float randomness = 0.0f; - // ggml_set_zero(targets); - ggml_set_f32(targets, -1.0f); - ggml_set_i32_1d(tokens_input, 0, 0); - for (int i=1; i 1.0f) ? 1.0f : z; // clamp to [0..1] - int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1)); - ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f); - if (ine[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == targets->ne[1]); - GGML_ASSERT(n_batch == targets->ne[2]); - - for (int k=0; kne[0], - k*tokens_input->nb[1]); - struct ggml_tensor * targets_k = ggml_view_2d(ctx, - targets, - targets->ne[0], - targets->ne[1], - targets->nb[1], - k*targets->nb[2]); - get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k); - } -} - -static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = targets->ne[0]; - for (int i=0; i work_buffer; - - for (int ex=0; ex1 NUMA node + + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, /* = GGML_DEFAULT_N_THREADS */ + struct ggml_threadpool * threadpool /* = NULL */ ); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + + // TODO: move to backend interface + GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_sve (void); + GGML_API int ggml_cpu_has_matmul_int8(void); + // get the sve vector length in bytes + GGML_API int ggml_cpu_get_sve_cnt(void); + + // Internal types and functions exposed for tests and benchmarks + + typedef void (*ggml_from_float_to_mat_t) + (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs); + typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); + typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); + typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, + const void * GGML_RESTRICT y, int nr, int nc); + + struct ggml_type_traits_cpu { + ggml_from_float_to_mat_t from_float_to_mat; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously + int64_t ncols; // number of columns to process simultaneously + ggml_gemv_t gemv; + ggml_gemm_t gemm; + }; + + GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); + + GGML_API void ggml_cpu_init(void); + + // + // CPU backend + // + + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + + GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2d93f31fa..8a0bcbff8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -573,6 +573,13 @@ extern "C" { GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) }; + struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -618,59 +625,6 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); - // Scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // Threadpool params - // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct ggml_threadpool * ggml_threadpool_t; - - // the compute plan that needs to be prepared for ggml_graph_compute() - // since https://github.com/ggerganov/ggml/issues/287 - struct ggml_cplan { - size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` - uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` - - int n_threads; - struct ggml_threadpool * threadpool; - - // abort ggml_graph_compute when true - ggml_abort_callback abort_callback; - void * abort_callback_data; - }; - - struct ggml_init_params { - // memory pool - size_t mem_size; // bytes - void * mem_buffer; // if NULL, memory will be allocated internally - bool no_alloc; // don't allocate memory for the tensor data - }; - - // numa strategies - enum ggml_numa_strategy { - GGML_NUMA_STRATEGY_DISABLED = 0, - GGML_NUMA_STRATEGY_DISTRIBUTE = 1, - GGML_NUMA_STRATEGY_ISOLATE = 2, - GGML_NUMA_STRATEGY_NUMACTL = 3, - GGML_NUMA_STRATEGY_MIRROR = 4, - GGML_NUMA_STRATEGY_COUNT - }; // // GUID @@ -693,9 +647,6 @@ extern "C" { // accepts a UTF-8 path, even on Windows GGML_API FILE * ggml_fopen(const char * fname, const char * mode); - GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems - GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node - GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); @@ -797,8 +748,7 @@ extern "C" { int64_t ne2, int64_t ne3); - GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); - GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); @@ -808,35 +758,25 @@ extern "C" { GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); - GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); - GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - // Converts a flat index into coordinates - GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); - GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); - GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - - GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); - GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); - - GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); - GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - - GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); - GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); - GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); GGML_ATTRIBUTE_FORMAT(2, 3) GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + // Tensor flags + GGML_API void ggml_set_input(struct ggml_tensor * tensor); + GGML_API void ggml_set_output(struct ggml_tensor * tensor); + GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API void ggml_set_loss(struct ggml_tensor * tensor); + // // operations on tensors with backpropagation // @@ -2052,9 +1992,6 @@ extern "C" { // automatic differentiation // - GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_set_loss(struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate); @@ -2086,27 +2023,6 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); - - // ggml_graph_plan() has to be called before ggml_graph_compute() - // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan( - const struct ggml_cgraph * cgraph, - int n_threads, /* = GGML_DEFAULT_N_THREADS */ - struct ggml_threadpool * threadpool /* = NULL */ ); - GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); - - // same as ggml_graph_compute() but the work data is allocated as a part of the context - // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data - GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); - GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); @@ -2277,6 +2193,8 @@ extern "C" { } lbfgs; }; + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // optimize the function defined by the tensor f @@ -2308,12 +2226,6 @@ extern "C" { ggml_opt_callback callback, void * callback_data); - // - // tensor flags - // - GGML_API void ggml_set_input(struct ggml_tensor * tensor); - GGML_API void ggml_set_output(struct ggml_tensor * tensor); - // // quantization // @@ -2482,8 +2394,6 @@ extern "C" { GGML_API int ggml_cpu_has_avx512_bf16(void); GGML_API int ggml_cpu_has_amx_int8 (void); GGML_API int ggml_cpu_has_fma (void); - GGML_API int ggml_cpu_has_neon (void); - GGML_API int ggml_cpu_has_sve (void); GGML_API int ggml_cpu_has_arm_fma (void); GGML_API int ggml_cpu_has_metal (void); GGML_API int ggml_cpu_has_f16c (void); @@ -2500,17 +2410,9 @@ extern "C" { GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); - GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_cann (void); GGML_API int ggml_cpu_has_llamafile (void); - // get the sve vector length in bytes - GGML_API int ggml_cpu_get_sve_cnt(void); - - // - // Internal types and functions exposed for tests and benchmarks - // - #ifdef __cplusplus // restrict not standard in C++ #define GGML_RESTRICT @@ -2519,14 +2421,6 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - typedef void (*ggml_from_float_to_mat_t) - (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs); - typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); - typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); struct ggml_type_traits { const char * type_name; @@ -2537,13 +2431,6 @@ extern "C" { ggml_to_float_t to_float; ggml_from_float_t from_float; ggml_from_float_t from_float_ref; - ggml_from_float_to_mat_t from_float_to_mat; - ggml_vec_dot_t vec_dot; - enum ggml_type vec_dot_type; - int64_t nrows; // number of rows to process simultaneously - int64_t ncols; // number of columns to process simultaneously - ggml_gemv_t gemv; - ggml_gemm_t gemm; }; GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 0764a8d90..82b81cf12 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1366,10 +1366,12 @@ endif() add_library(ggml ../include/ggml.h + ../include/ggml-cpu.h ../include/ggml-alloc.h ../include/ggml-backend.h ../include/ggml-cpp.h ggml.c + ggml-cpu.c ggml-alloc.c ggml-backend.cpp ggml-quants.c diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index eb30f8944..81f62ff4f 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -7,6 +7,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu.h" #include "ggml-cpu-impl.h" #include diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index c2afdf391..0b8ebac53 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -8,6 +8,7 @@ #include #endif +#include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" @@ -566,6 +567,8 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-kompute.h" #endif +#include "ggml-cpu.h" + struct ggml_backend_registry { std::vector backends; std::vector devices; @@ -713,616 +716,6 @@ ggml_backend_t ggml_backend_init_best(void) { return ggml_backend_dev_init(dev, NULL); } -// CPU backend - buffer - -static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { - uintptr_t data = (uintptr_t)buffer->context; - - // align the buffer - if (data % TENSOR_ALIGNMENT != 0) { - data = GGML_PAD(data, TENSOR_ALIGNMENT); - } - - return (void *)data; -} - -static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_aligned_free(buffer->context, buffer->size); -} - -static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - memset(buffer->context, value, buffer->size); -} - -static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { - /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, - /* .clear = */ ggml_backend_cpu_buffer_clear, - /* .reset = */ NULL, -}; - -static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, - /* .clear = */ ggml_backend_cpu_buffer_clear, - /* .reset = */ NULL, -}; - -// CPU backend - buffer type - -static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * data = ggml_aligned_malloc(size); - - if (data == NULL) { - GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); - return NULL; - } - - return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); -} - -static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; - - GGML_UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type; -} - -static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_Mapped"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type; -} - -#ifdef GGML_USE_CPU_HBM - -// buffer type HBM - -#include - -static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_HBM"; - - GGML_UNUSED(buft); -} - -static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { - hbw_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr; - int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); - if (result != 0) { - GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); - return NULL; - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type_hbm; -} -#endif - -static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { - static ggml_backend_buffer_type_t bufts[] = { -#ifdef GGML_USE_CPU_HBM - ggml_backend_cpu_hbm_buffer_type(), -#endif - NULL - }; - - return bufts; - - GGML_UNUSED(device); -} - -// CPU backend - backend (stream) - -struct ggml_backend_cpu_context { - int n_threads; - ggml_threadpool_t threadpool; - - uint8_t * work_data; - size_t work_size; - - ggml_abort_callback abort_callback; - void * abort_callback_data; -}; - -static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { - return "CPU"; - - GGML_UNUSED(backend); -} - -static void ggml_backend_cpu_free(ggml_backend_t backend) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - delete[] cpu_ctx->work_data; - delete cpu_ctx; - delete backend; -} - -struct ggml_backend_plan_cpu { - struct ggml_cplan cplan; - struct ggml_cgraph cgraph; -}; - -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - - struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; - - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); - cpu_plan->cgraph = *cgraph; // FIXME: deep copy - - if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; - if (cpu_plan->cplan.work_data == NULL) { - delete cpu_plan; - return NULL; - } - } - - cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; - cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; - - return cpu_plan; -} - -static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - - delete[] cpu_plan->cplan.work_data; - delete cpu_plan; - - GGML_UNUSED(backend); -} - -static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - - return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); - - GGML_UNUSED(backend); -} - -static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); - - if (cpu_ctx->work_size < cplan.work_size) { - delete[] cpu_ctx->work_data; - cpu_ctx->work_data = new uint8_t[cplan.work_size]; - if (cpu_ctx->work_data == NULL) { - cpu_ctx->work_size = 0; - return GGML_STATUS_ALLOC_FAILED; - } - cpu_ctx->work_size = cplan.work_size; - } - cplan.work_data = (uint8_t *)cpu_ctx->work_data; - - cplan.abort_callback = cpu_ctx->abort_callback; - cplan.abort_callback_data = cpu_ctx->abort_callback_data; - - return ggml_graph_compute(cgraph, &cplan); -} - -static const struct ggml_backend_i ggml_backend_cpu_i = { - /* .get_name = */ ggml_backend_cpu_get_name, - /* .free = */ ggml_backend_cpu_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, - /* .graph_compute = */ ggml_backend_cpu_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_cpu_guid(void) { - static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 }; - return &guid; -} - -ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; - if (ctx == NULL) { - return NULL; - } - - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->threadpool = NULL; - ctx->work_data = NULL; - ctx->work_size = 0; - ctx->abort_callback = NULL; - ctx->abort_callback_data = NULL; - - ggml_backend_t cpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_cpu_guid(), - /* .interface = */ ggml_backend_cpu_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ ctx, - }; - - if (cpu_backend == NULL) { - delete ctx; - return NULL; - } - - return cpu_backend; -} - -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); -} - -void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { - GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); - - struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; - ctx->n_threads = n_threads; -} - -void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { - GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); - - struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; - - if (ctx->threadpool && ctx->threadpool != threadpool) { - // already had a different threadpool, pause/suspend it before switching - ggml_threadpool_pause(ctx->threadpool); - } - ctx->threadpool = threadpool; -} - -void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { - GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); - - struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; -} - -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { - GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); -} - -// CPU backend - device - -struct ggml_backend_cpu_device_context { - std::string description = "CPU"; - - ggml_backend_cpu_device_context() { -#ifdef __APPLE__ - size_t len = 0; - if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) { - description.resize(len); - sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT - } -#elif defined(__linux__) - FILE * f = fopen("/proc/cpuinfo", "r"); - if (f) { - char buf[1024]; - while (fgets(buf, sizeof(buf), f)) { - if (strncmp(buf, "model name", 10) == 0) { - char * p = strchr(buf, ':'); - if (p) { - p++; - while (std::isspace(*p)) { - p++; - } - while (std::isspace(p[strlen(p) - 1])) { - p[strlen(p) - 1] = '\0'; - } - description = p; - break; - } - } - } - fclose(f); - } -#elif defined(_WIN32) - HKEY hKey; - if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, - TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), - 0, - KEY_READ, - &hKey) == ERROR_SUCCESS) { - DWORD cpu_brand_size = 0; - if (RegQueryValueExA(hKey, - TEXT("ProcessorNameString"), - NULL, - NULL, - NULL, - &cpu_brand_size) == ERROR_SUCCESS) { - description.resize(cpu_brand_size); - if (RegQueryValueExA(hKey, - TEXT("ProcessorNameString"), - NULL, - NULL, - (LPBYTE)&description[0], // NOLINT - &cpu_brand_size) == ERROR_SUCCESS) { - if (description.find('\0') != std::string::npos) { - description.resize(description.find('\0')); - } - } - } - RegCloseKey(hKey); - } -#endif - } -}; - -static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { - return "CPU"; - - GGML_UNUSED(dev); -} - -static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { - struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; - - return ctx->description.c_str(); -} - -static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; - - GGML_UNUSED(dev); -} - -static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_CPU; - - GGML_UNUSED(dev); -} - -static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_cpu_device_get_name(dev); - props->description = ggml_backend_cpu_device_get_description(dev); - props->type = ggml_backend_cpu_device_get_type(dev); - ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) { - return ggml_backend_cpu_init(); - - GGML_UNUSED(dev); - GGML_UNUSED(params); -} - -static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(dev); -} - -static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); - - GGML_UNUSED(dev); - GGML_UNUSED(max_tensor_size); -} - -static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - switch (op->op) { - case GGML_OP_CPY: - return - op->type != GGML_TYPE_IQ2_XXS && - op->type != GGML_TYPE_IQ2_XS && - op->type != GGML_TYPE_IQ1_S && - op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float - case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; - case GGML_OP_ROPE_BACK: - return op->src[2] == NULL && (op->op_params[2] & 4) == 0; - case GGML_OP_IM2COL_BACK: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; - case GGML_OP_OUT_PROD: - return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; - default: - return true; - } - - GGML_UNUSED(dev); -} - -static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { - /* .get_name = */ ggml_backend_cpu_device_get_name, - /* .get_description = */ ggml_backend_cpu_device_get_description, - /* .get_memory = */ ggml_backend_cpu_device_get_memory, - /* .get_type = */ ggml_backend_cpu_device_get_type, - /* .get_props = */ ggml_backend_cpu_device_get_props, - /* .init_backend = */ ggml_backend_cpu_device_init_backend, - /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_cpu_device_supports_op, - /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -// CPU backend - backend (reg) - -static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { - return "CPU"; - - GGML_UNUSED(reg); -} - -static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); -} - -static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - static ggml_backend_cpu_device_context ctx; - static ggml_backend_device ggml_backend_cpu_device = { - /* .iface = */ ggml_backend_cpu_device_i, - /* .reg = */ reg, - /* .context = */ &ctx, - }; - - return &ggml_backend_cpu_device; -} - -static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_cpu_set_n_threads; - } - if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { - return (void *)ggml_backend_cpu_get_extra_bufts; - } - - return NULL; - - GGML_UNUSED(reg); -} - -static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { - /* .get_name = */ ggml_backend_cpu_reg_get_name, - /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, - /* .get_device = */ ggml_backend_cpu_reg_get_device, - /* .get_proc_address = */ ggml_backend_cpu_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_cpu_reg(void) { - static struct ggml_backend_reg ggml_backend_cpu_reg = { - /* .iface = */ ggml_backend_cpu_reg_i, - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_reg; -} - // multi-buffer buffer struct ggml_backend_multi_buffer_context { @@ -2642,3 +2035,627 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t return true; } + + + +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" +#include +#include + +// ggml-backend interface + +// CPU backend - buffer + +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + uintptr_t data = (uintptr_t)buffer->context; + + // align the buffer + if (data % TENSOR_ALIGNMENT != 0) { + data = GGML_PAD(data, TENSOR_ALIGNMENT); + } + + return (void *)data; +} + +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, buffer->size); +} + +static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + memcpy((char *)tensor->data + offset, data, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + +static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { + /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, +}; + +static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, +}; + +// CPU backend - buffer type + +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * data = ggml_aligned_malloc(size); + + if (data == NULL) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); +} + +static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + GGML_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type; +} + +static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_Mapped"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type; +} + +#ifdef GGML_USE_CPU_HBM + +// buffer type HBM + +#include + +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type_hbm; +} +#endif + +static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { + static ggml_backend_buffer_type_t bufts[] = { +#ifdef GGML_USE_CPU_HBM + ggml_backend_cpu_hbm_buffer_type(), +#endif + NULL + }; + + return bufts; + + GGML_UNUSED(device); +} + +// CPU backend - backend (stream) + +struct ggml_backend_cpu_context { + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t * work_data; + size_t work_size; + + ggml_abort_callback abort_callback; + void * abort_callback_data; +}; + +static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { + return "CPU"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_cpu_free(ggml_backend_t backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + delete[] cpu_ctx->work_data; + delete cpu_ctx; + delete backend; +} + +struct ggml_backend_plan_cpu { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu; + + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); + cpu_plan->cgraph = *cgraph; // FIXME: deep copy + + if (cpu_plan->cplan.work_size > 0) { + cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; + if (cpu_plan->cplan.work_data == NULL) { + delete cpu_plan; + return NULL; + } + } + + cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; + cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return cpu_plan; +} + +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + delete[] cpu_plan->cplan.work_data; + delete cpu_plan; + + GGML_UNUSED(backend); +} + +static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + + GGML_UNUSED(backend); +} + +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); + + if (cpu_ctx->work_size < cplan.work_size) { + delete[] cpu_ctx->work_data; + cpu_ctx->work_data = new uint8_t[cplan.work_size]; + if (cpu_ctx->work_data == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + cpu_ctx->work_size = cplan.work_size; + } + cplan.work_data = (uint8_t *)cpu_ctx->work_data; + + cplan.abort_callback = cpu_ctx->abort_callback; + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return ggml_graph_compute(cgraph, &cplan); +} + +static const struct ggml_backend_i ggml_backend_cpu_i = { + /* .get_name = */ ggml_backend_cpu_get_name, + /* .free = */ ggml_backend_cpu_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cpu_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_cpu_guid(void) { + static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 }; + return &guid; +} + +ggml_backend_t ggml_backend_cpu_init(void) { + // initialize CPU backend now to avoid slowing the first graph computation + ggml_cpu_init(); + + struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context; + if (ctx == NULL) { + return NULL; + } + + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; + + ggml_backend_t cpu_backend = new ggml_backend { + /* .guid = */ ggml_backend_cpu_guid(), + /* .interface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, + }; + + if (cpu_backend == NULL) { + delete ctx; + return NULL; + } + + return cpu_backend; +} + +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); +} + +void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->n_threads = n_threads; +} + +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + + if (ctx->threadpool && ctx->threadpool != threadpool) { + // already had a different threadpool, pause/suspend it before switching + ggml_threadpool_pause(ctx->threadpool); + } + ctx->threadpool = threadpool; +} + +void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; +} + +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); + return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); +} + +// CPU backend - device + +struct ggml_backend_cpu_device_context { + std::string description = "CPU"; + + ggml_backend_cpu_device_context() { +#ifdef __APPLE__ + size_t len = 0; + if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) { + description.resize(len); + sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT + } +#elif defined(__linux__) + FILE * f = fopen("/proc/cpuinfo", "r"); + if (f) { + char buf[1024]; + while (fgets(buf, sizeof(buf), f)) { + if (strncmp(buf, "model name", 10) == 0) { + char * p = strchr(buf, ':'); + if (p) { + p++; + while (std::isspace(*p)) { + p++; + } + while (std::isspace(p[strlen(p) - 1])) { + p[strlen(p) - 1] = '\0'; + } + description = p; + break; + } + } + } + fclose(f); + } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) == ERROR_SUCCESS) { + DWORD cpu_brand_size = 0; + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + NULL, + &cpu_brand_size) == ERROR_SUCCESS) { + description.resize(cpu_brand_size); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)&description[0], // NOLINT + &cpu_brand_size) == ERROR_SUCCESS) { + if (description.find('\0') != std::string::npos) { + description.resize(description.find('\0')); + } + } + } + RegCloseKey(hKey); + } +#endif + } +}; + +static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) { + return "CPU"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) { + struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context; + + return ctx->description.c_str(); +} + +static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_CPU; + + GGML_UNUSED(dev); +} + +static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_cpu_device_get_name(dev); + props->description = ggml_backend_cpu_device_get_description(dev); + props->type = ggml_backend_cpu_device_get_type(dev); + ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) { + return ggml_backend_cpu_init(); + + GGML_UNUSED(dev); + GGML_UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_CPY: + return + op->type != GGML_TYPE_IQ2_XXS && + op->type != GGML_TYPE_IQ2_XS && + op->type != GGML_TYPE_IQ1_S && + op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float + case GGML_OP_MUL_MAT: + return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; + case GGML_OP_ROPE_BACK: + return op->src[2] == NULL && (op->op_params[2] & 4) == 0; + case GGML_OP_IM2COL_BACK: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD: + return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32; + default: + return true; + } + + GGML_UNUSED(dev); +} + +static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { + /* .get_name = */ ggml_backend_cpu_device_get_name, + /* .get_description = */ ggml_backend_cpu_device_get_description, + /* .get_memory = */ ggml_backend_cpu_device_get_memory, + /* .get_type = */ ggml_backend_cpu_device_get_type, + /* .get_props = */ ggml_backend_cpu_device_get_props, + /* .init_backend = */ ggml_backend_cpu_device_init_backend, + /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_cpu_device_supports_op, + /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// CPU backend - backend (reg) + +static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { + return "CPU"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_cpu_device_context ctx; + static ggml_backend_device ggml_backend_cpu_device = { + /* .iface = */ ggml_backend_cpu_device_i, + /* .reg = */ reg, + /* .context = */ &ctx, + }; + + return &ggml_backend_cpu_device; +} + +static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_cpu_set_n_threads; + } + if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { + return (void *)ggml_backend_cpu_get_extra_bufts; + } + + return NULL; + + GGML_UNUSED(reg); +} + +static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = { + /* .get_name = */ ggml_backend_cpu_reg_get_name, + /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count, + /* .get_device = */ ggml_backend_cpu_reg_get_device, + /* .get_proc_address = */ ggml_backend_cpu_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_cpu_reg(void) { + static struct ggml_backend_reg ggml_backend_cpu_reg = { + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_reg; +} diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c new file mode 100644 index 000000000..4b8ffb629 --- /dev/null +++ b/ggml/src/ggml-cpu.c @@ -0,0 +1,13715 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC + +#include "ggml-aarch64.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-cpu-impl.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" +#include "ggml-quants.h" +#include "ggml.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__gnu_linux__) +#include +#endif + +#ifdef GGML_USE_OPENMP +#include +#endif + +#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) +#undef GGML_USE_LLAMAFILE +#endif + +#ifdef GGML_USE_LLAMAFILE +#include +#endif + +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnings +// these functions are never going away, anyway +#pragma warning(disable: 4996) + +// unreachable code because of multiple instances of code after GGML_ABORT +#pragma warning(disable: 4702) +#endif + +// Note: once we move threading into a separate C++ file +// will use std::hardware_destructive_interference_size instead of hardcoding it here +// and we'll use C++ attribute syntax. +#define GGML_CACHE_LINE 64 + +#if defined(__clang__) || defined(__GNUC__) +#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE))) +#endif + +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define GGML_TSAN_ENABLED 1 +#endif +#else // __has_feature +#if defined(__SANITIZE_THREAD__) +#define GGML_TSAN_ENABLED 1 +#endif +#endif // __has_feature + +#define UNUSED GGML_UNUSED +#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) + +#if defined(GGML_USE_ACCELERATE) +#include +#endif + +// floating point type used to accumulate sums +typedef double ggml_float; + +#define GGML_GELU_FP16 +#define GGML_GELU_QUICK_FP16 + +#define GGML_SOFT_MAX_UNROLL 4 +#define GGML_VEC_DOT_UNROLL 2 +#define GGML_VEC_MAD_UNROLL 32 + +// +// global data +// + +// precomputed gelu table for f16 (128 KB) +static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; + +// precomputed quick gelu table for f16 (128 KB) +static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; + +// precomputed f32 table for f16 (256 KB) (ggml-impl.h) +float ggml_table_f32_f16[1 << 16]; + +#if defined(__ARM_ARCH) +struct ggml_arm_arch_features_type { + int has_neon; + int has_i8mm; + int has_sve; + int sve_cnt; +} ggml_arm_arch_features = {-1, -1, -1, 0}; +#endif + + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX + #define NOMINMAX +#endif +#include + + +#if !defined(__clang__) +#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE)) + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +typedef atomic_int atomic_flag; + +#define ATOMIC_FLAG_INIT 0 + +typedef enum { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst +} memory_order; + +static void atomic_store(atomic_int * ptr, LONG val) { + InterlockedExchange(ptr, val); +} +static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) { + // TODO: add support for explicit memory order + InterlockedExchange(ptr, val); +} +static LONG atomic_load(atomic_int * ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedExchangeAdd(ptr, inc); +} +static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { + return InterlockedExchange(ptr, 1); +} +static void atomic_flag_clear(atomic_flag * ptr) { + InterlockedExchange(ptr, 0); +} +static void atomic_thread_fence(memory_order mo) { + MemoryBarrier(); +} +#else // clang +#include +#endif + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { + (void) unused; + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void * unused) { + (void) unused; + int ret = (int) WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; +} + +static int sched_yield (void) { + Sleep (0); + return 0; +} +#else + +#include +#include +#include +#if defined(__FreeBSD__) +#include +#endif + +typedef void * thread_ret_t; + +#include +#include +#include + +#endif + +typedef pthread_t ggml_thread_t; + +#ifdef GGML_USE_CPU_HBM +#include +#endif + +#if defined(__APPLE__) +#include +#include +#include +#endif + +// +// cache line +// + +#if defined(__cpp_lib_hardware_interference_size) +#define CACHE_LINE_SIZE hardware_destructive_interference_size +#else +#if defined(__POWER9_VECTOR__) +#define CACHE_LINE_SIZE 128 +#else +#define CACHE_LINE_SIZE 64 +#endif +#endif + +static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); + + +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); + +static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, + }, + [GGML_TYPE_F16] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + .nrows = 1, + }, + [GGML_TYPE_Q4_0] = { + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q4_1] = { + .vec_dot = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [4] = { // GGML_TYPE_Q4_2 + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, + }, + [5] = { // GGML_TYPE_Q4_3 + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, + }, + [GGML_TYPE_Q5_0] = { + .vec_dot = ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q5_1] = { + .vec_dot = ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q8_0] = { + .vec_dot = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q8_1] = { + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q2_K] = { + .vec_dot = ggml_vec_dot_q2_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q3_K] = { + .vec_dot = ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q4_K] = { + .vec_dot = ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q5_K] = { + .vec_dot = ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q6_K] = { + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ2_XXS] = { + .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ2_XS] = { + .vec_dot = ggml_vec_dot_iq2_xs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ3_XXS] = { + .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ3_S] = { + .vec_dot = ggml_vec_dot_iq3_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ2_S] = { + .vec_dot = ggml_vec_dot_iq2_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ1_S] = { + .vec_dot = ggml_vec_dot_iq1_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ1_M] = { + .vec_dot = ggml_vec_dot_iq1_m_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ4_NL] = { + .vec_dot = ggml_vec_dot_iq4_nl_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_IQ4_XS] = { + .vec_dot = ggml_vec_dot_iq4_xs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_BF16] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, + .vec_dot_type = GGML_TYPE_BF16, + .nrows = 1, + }, + [GGML_TYPE_Q4_0_4_4] = { + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 4, + .gemv = ggml_gemv_q4_0_4x4_q8_0, + .gemm = ggml_gemm_q4_0_4x4_q8_0, + }, + [GGML_TYPE_Q4_0_4_8] = { + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + .ncols = 4, + .gemv = ggml_gemv_q4_0_4x8_q8_0, + .gemm = ggml_gemm_q4_0_4x8_q8_0, + }, + [GGML_TYPE_Q4_0_8_8] = { + .nrows = 1, + .ncols = 8, + .gemv = ggml_gemv_q4_0_8x8_q8_0, + .gemm = ggml_gemm_q4_0_8x8_q8_0, + }, + [GGML_TYPE_TQ1_0] = { + .vec_dot = ggml_vec_dot_tq1_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_TQ2_0] = { + .vec_dot = ggml_vec_dot_tq2_0_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, +}; + +const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { + return &type_traits_cpu[type]; +} + +// +// simd mappings +// + +// we define a common set of C macros which map to specific intrinsics based on the current architecture +// we then implement the fundamental computation operations below using only these macros +// adding support for new architectures requires to define the corresponding SIMD macros +// +// GGML_F32_STEP / GGML_F16_STEP +// number of elements to process in a single step +// +// GGML_F32_EPR / GGML_F16_EPR +// number of elements to fit in a single register +// + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#define GGML_SIMD + +// F32 NEON + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 float32x4_t +#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_F32x4_LOAD vld1q_f32 +#define GGML_F32x4_STORE vst1q_f32 +#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define GGML_F32x4_ADD vaddq_f32 +#define GGML_F32x4_MUL vmulq_f32 +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define GGML_F16_STEP 32 + #define GGML_F16_EPR 8 + + #define GGML_F16x8 float16x8_t + #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) + #define GGML_F16x8_STORE vst1q_f16 + #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_F16x8_ADD vaddq_f16 + #define GGML_F16x8_MUL vmulq_f16 + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } while (0) + + #define GGML_F16_VEC GGML_F16x8 + #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO + #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) + #define GGML_F16_VEC_FMA GGML_F16x8_FMA + #define GGML_F16_VEC_ADD GGML_F16x8_ADD + #define GGML_F16_VEC_MUL GGML_F16x8_MUL + #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 + + #define GGML_F16_STEP 16 + #define GGML_F16_EPR 4 + + #define GGML_F32Cx4 float32x4_t + #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) + #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_F32Cx4_ADD vaddq_f32 + #define GGML_F32Cx4_MUL vmulq_f32 + #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + + #define GGML_F16_VEC GGML_F32Cx4 + #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO + #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA + #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE +#endif + +#elif defined(__AVX512F__) + +#define GGML_SIMD + +// F32 AVX512 + +#define GGML_F32_STEP 64 +#define GGML_F32_EPR 16 + +#define GGML_F32x16 __m512 +#define GGML_F32x16_ZERO _mm512_setzero_ps() +#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) +#define GGML_F32x16_LOAD _mm512_loadu_ps +#define GGML_F32x16_STORE _mm512_storeu_ps +// _mm512_fmadd_ps is defined in AVX512F so no guard is required +#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32x16_ADD _mm512_add_ps +#define GGML_F32x16_MUL _mm512_mul_ps +#define GGML_F32x16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD +#define GGML_F32_VEC_STORE GGML_F32x16_STORE +#define GGML_F32_VEC_FMA GGML_F32x16_FMA +#define GGML_F32_VEC_ADD GGML_F32x16_ADD +#define GGML_F32_VEC_MUL GGML_F32x16_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE + +// F16 AVX512 + +// F16 AVX + +#define GGML_F16_STEP 64 +#define GGML_F16_EPR 16 + +// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead + +#define GGML_F32Cx16 __m512 +#define GGML_F32Cx16_ZERO _mm512_setzero_ps() +#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) + +// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F +// so F16C guard isn't required +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) +#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) + +#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32Cx16_ADD _mm512_add_ps +#define GGML_F32Cx16_MUL _mm512_mul_ps +#define GGML_F32Cx16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F32Cx16 +#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE + +#elif defined(__AVX__) + +#define GGML_SIMD + +// F32 AVX + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 8 + +#define GGML_F32x8 __m256 +#define GGML_F32x8_ZERO _mm256_setzero_ps() +#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_F32x8_LOAD _mm256_loadu_ps +#define GGML_F32x8_STORE _mm256_storeu_ps +#if defined(__FMA__) + #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) +#else + #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) +#endif +#define GGML_F32x8_ADD _mm256_add_ps +#define GGML_F32x8_MUL _mm256_mul_ps +#define GGML_F32x8_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ + _mm256_extractf128_ps(x[0], 1)); \ + const __m128 t1 = _mm_hadd_ps(t0, t0); \ + res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ +} while (0) +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x8 +#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD +#define GGML_F32_VEC_STORE GGML_F32x8_STORE +#define GGML_F32_VEC_FMA GGML_F32x8_FMA +#define GGML_F32_VEC_ADD GGML_F32x8_ADD +#define GGML_F32_VEC_MUL GGML_F32x8_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE + +// F16 AVX + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C +#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) +#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = GGML_FP16_TO_FP32(x[i]); + } + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_FP32_TO_FP16(arr[i]); +} +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + +#define GGML_F32Cx8_FMA GGML_F32x8_FMA +#define GGML_F32Cx8_ADD _mm256_add_ps +#define GGML_F32Cx8_MUL _mm256_mul_ps +#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE + +#define GGML_F16_VEC GGML_F32Cx8 +#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE + +#elif defined(__POWER9_VECTOR__) + +#define GGML_SIMD + +// F32 POWER9 + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 vector float +#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_SET1 vec_splats +#define GGML_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_F32x4_ADD vec_add +#define GGML_F32x4_MUL vec_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + res = vec_extract(x[0], 0) + \ + vec_extract(x[0], 1) + \ + vec_extract(x[0], 2) + \ + vec_extract(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 POWER9 +#define GGML_F16_STEP GGML_F32_STEP +#define GGML_F16_EPR GGML_F32_EPR +#define GGML_F16_VEC GGML_F32x4 +#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F16_VEC_FMA GGML_F32x4_FMA +#define GGML_F16_VEC_ADD GGML_F32x4_ADD +#define GGML_F16_VEC_MUL GGML_F32x4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE +// Use vec_xl, not vec_ld, in case the load address is not aligned. +#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ + vec_extract_fp32_from_shortl(vec_xl(0, p)) +#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_F16_VEC_STORE(p, r, i) \ + if (i & 0x1) \ + vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ + r[i - GGML_ENDIAN_BYTE(0)]), \ + 0, p - GGML_F16_EPR) + +#elif defined(__wasm_simd128__) + +#define GGML_SIMD + +// F32 WASM + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 v128_t +#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F32x4_LOAD wasm_v128_load +#define GGML_F32x4_STORE wasm_v128_store +#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_F32x4_ADD wasm_f32x4_add +#define GGML_F32x4_MUL wasm_f32x4_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 WASM + +#define GGML_F16_STEP 16 +#define GGML_F16_EPR 4 + +inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(p[0]); + tmp[1] = GGML_FP16_TO_FP32(p[1]); + tmp[2] = GGML_FP16_TO_FP32(p[2]); + tmp[3] = GGML_FP16_TO_FP32(p[3]); + + return wasm_v128_load(tmp); +} + +inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { + float tmp[4]; + + wasm_v128_store(tmp, x); + + p[0] = GGML_FP32_TO_FP16(tmp[0]); + p[1] = GGML_FP32_TO_FP16(tmp[1]); + p[2] = GGML_FP32_TO_FP16(tmp[2]); + p[3] = GGML_FP32_TO_FP16(tmp[3]); +} + +#define GGML_F16x4 v128_t +#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_F16x4_FMA GGML_F32x4_FMA +#define GGML_F16x4_ADD wasm_f32x4_add +#define GGML_F16x4_MUL wasm_f32x4_mul +#define GGML_F16x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F16_VEC GGML_F16x4 +#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F16x4_FMA +#define GGML_F16_VEC_ADD GGML_F16x4_ADD +#define GGML_F16_VEC_MUL GGML_F16x4_MUL +#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE + +#elif defined(__SSE3__) + +#define GGML_SIMD + +// F32 SSE + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 __m128 +#define GGML_F32x4_ZERO _mm_setzero_ps() +#define GGML_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_F32x4_LOAD _mm_loadu_ps +#define GGML_F32x4_STORE _mm_storeu_ps +#if defined(__FMA__) + // TODO: Does this work? + #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) +#else + #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) +#endif +#define GGML_F32x4_ADD _mm_add_ps +#define GGML_F32x4_MUL _mm_mul_ps +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ + res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ +} +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 SSE + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 4 + +static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(x[0]); + tmp[1] = GGML_FP16_TO_FP32(x[1]); + tmp[2] = GGML_FP16_TO_FP32(x[2]); + tmp[3] = GGML_FP16_TO_FP32(x[3]); + + return _mm_loadu_ps(tmp); +} + +static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { + float arr[4]; + + _mm_storeu_ps(arr, y); + + x[0] = GGML_FP32_TO_FP16(arr[0]); + x[1] = GGML_FP32_TO_FP16(arr[1]); + x[2] = GGML_FP32_TO_FP16(arr[2]); + x[3] = GGML_FP32_TO_FP16(arr[3]); +} + +#define GGML_F32Cx4 __m128 +#define GGML_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_F32Cx4_FMA GGML_F32x4_FMA +#define GGML_F32Cx4_ADD _mm_add_ps +#define GGML_F32Cx4_MUL _mm_mul_ps +#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + +#define GGML_F16_VEC GGML_F32Cx4 +#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +#elif defined(__loongarch_asx) + +#define GGML_SIMD + +// F32 LASX +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 8 + +#define GGML_F32x8 __m256 +#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) +#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) +#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) +#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) +#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) +#define GGML_F32x8_ADD __lasx_xvfadd_s +#define GGML_F32x8_MUL __lasx_xvfmul_s +#define GGML_F32x8_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ + } \ + float *tmp_p = (float *)&x[0]; \ + res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ +} while (0) +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x8 +#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD +#define GGML_F32_VEC_STORE GGML_F32x8_STORE +#define GGML_F32_VEC_FMA GGML_F32x8_FMA +#define GGML_F32_VEC_ADD GGML_F32x8_ADD +#define GGML_F32_VEC_MUL GGML_F32x8_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE + +// F16 LASX + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) +#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) + +static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = GGML_FP16_TO_FP32(x[i]); + } + + return (__m256)__lasx_xvld(tmp, 0); +} +static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { + float arr[8]; + + __lasx_xvst(y, arr, 0); + + for (int i = 0; i < 8; i++) { + x[i] = GGML_FP32_TO_FP16(arr[i]); + } +} +#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) + +#define GGML_F32Cx8_FMA GGML_F32x8_FMA +#define GGML_F32Cx8_ADD __lasx_xvfadd_s +#define GGML_F32Cx8_MUL __lasx_xvfmul_s +#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE + +#define GGML_F16_VEC GGML_F32Cx8 +#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE + +#elif defined(__loongarch_sx) + +#define GGML_SIMD + +// F32 LSX + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 __m128 +#define GGML_F32x4_ZERO __lsx_vldi(0) +#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) +#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) +#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) +#define GGML_F32x4_ADD __lsx_vfadd_s +#define GGML_F32x4_MUL __lsx_vfmul_s +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ + } \ + __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ + tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ + tmp = __lsx_vsrli_d((__m128i)t0, 32); \ + tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 LSX + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 4 + +static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(x[0]); + tmp[1] = GGML_FP16_TO_FP32(x[1]); + tmp[2] = GGML_FP16_TO_FP32(x[2]); + tmp[3] = GGML_FP16_TO_FP32(x[3]); + + return __lsx_vld(tmp, 0); +} + +static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { + float arr[4]; + + __lsx_vst(y, arr, 0); + + x[0] = GGML_FP32_TO_FP16(arr[0]); + x[1] = GGML_FP32_TO_FP16(arr[1]); + x[2] = GGML_FP32_TO_FP16(arr[2]); + x[3] = GGML_FP32_TO_FP16(arr[3]); +} + +#define GGML_F32Cx4 __m128 +#define GGML_F32Cx4_ZERO __lsx_vldi(0) +#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) +#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x) +#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) +#define GGML_F32Cx4_FMA GGML_F32x4_FMA +#define GGML_F32Cx4_ADD __lsx_vfadd_s +#define GGML_F32Cx4_MUL __lsx_vfmul_s +#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + +#define GGML_F16_VEC GGML_F32Cx4 +#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +#endif + +// GGML_F32_ARR / GGML_F16_ARR +// number of registers to use per step +#ifdef GGML_SIMD +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) +#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) +#endif + +// +// Threading defs +// + +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; + +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 +#define ggml_cond_init(c) pthread_cond_init(c, NULL) +#define ggml_cond_destroy(c) pthread_cond_destroy(c) +#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) +#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +// Threadpool def +struct ggml_threadpool { + ggml_mutex_t mutex; // mutex for cond.var + ggml_cond_t cond; // cond.var for waiting for new work + + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; + + // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) + atomic_int GGML_CACHE_ALIGN n_barrier; + atomic_int GGML_CACHE_ALIGN n_barrier_passed; + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. + + // these are atomic as an annotation for thread-sanitizer + atomic_bool stop; // Used for stopping the threadpool altogether + atomic_bool pause; // Used for pausing the threadpool or individual threads + atomic_bool abort; // Used for aborting processing of a graph + + struct ggml_compute_state * workers; // per thread state + int n_threads_max; // number of threads in the pool + atomic_int n_threads_cur; // number of threads used in the current graph + + int32_t prio; // Scheduling priority + uint32_t poll; // Polling level (0 - no polling) + + enum ggml_status ec; +}; + +// Per-thread state +struct ggml_compute_state { +#ifndef GGML_USE_OPENMP + ggml_thread_t thrd; + bool cpumask[GGML_MAX_N_THREADS]; + int last_graph; + bool pending; +#endif + struct ggml_threadpool * threadpool; + int ith; +}; + +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + + struct ggml_threadpool * threadpool; +}; + +// +// fundamental operations +// + +inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + +#if defined(GGML_SIMD) + float sumf = 0.0f; + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F32_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } +#else + // scalar + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(x[i]*y[i]); + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + int i = 0; + ggml_float sumf = 0; + +#if defined(__AVX512BF16__) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 64 <= n; i += 64) { + c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), + m512bh(_mm512_loadu_si512((y + i)))); + c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), + m512bh(_mm512_loadu_si512((y + i + 32)))); + } + sumf += (ggml_float)_mm512_reduce_add_ps(c1); + sumf += (ggml_float)_mm512_reduce_add_ps(c2); + +#elif defined(__AVX512F__) +#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); + } + sumf += (ggml_float)_mm512_reduce_add_ps(c1); + sumf += (ggml_float)_mm512_reduce_add_ps(c2); + +#undef LOAD +#elif defined(__AVX2__) +#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + __m256 c4 = _mm256_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); + c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); + c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); + } + __m128 g; + c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), + _mm256_add_ps(c2, c4)); + g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), + _mm256_castps256_ps128(c1)); + g = _mm_add_ps(g, _mm_movehl_ps(g, g)); + g = _mm_add_ss(g, _mm_movehdup_ps(g)); + sumf += (ggml_float)_mm_cvtss_f32(g); + +#undef LOAD +#endif + + for (; i < n; ++i) { + sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * + GGML_BF16_TO_FP32(y[i])); + } + *s = sumf; +} + +static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + ggml_float sumf = 0.0; + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F16_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#else + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#endif + + *s = sumf; +} + +// compute GGML_VEC_DOT_UNROLL dot products at once +// xs - x row stride in bytes +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { + ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; + + ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + } + } + } + + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + } + + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#else + for (int i = 0; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#endif + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + s[i] = sumf[i]; + } +} + +inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] += x[i]*v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +#endif +} + +inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#endif +} + +// xs and vs are byte strides of x and v +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[GGML_VEC_MAD_UNROLL]; + const float * restrict v[GGML_VEC_MAD_UNROLL]; + + for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_F32_VEC_SET1(v[k][0]); + } + + GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + +//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &v, y, 1, n); +#elif defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_MUL(ay[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] *= v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] *= v; + } +#endif +} + +inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_MUL(ay[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + } +#endif +} + +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } +inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } +inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } +inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } +inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } +inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } +// TODO: optimize performance +inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } +inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } + +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +inline static float ggml_gelu_f32(float x) { + return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + y[i] = ggml_table_gelu_f16[i16[i]]; + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + if (x[i] <= -10.0f) { + y[i] = 0.0f; + } else if (x[i] >= 10.0f) { + y[i] = x[i]; + } else { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + } + } +} +#else +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]); + } +} +#endif + +inline static float ggml_gelu_quick_f32(float x) { + return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); +} + +//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = ggml_table_gelu_quick_f16[i16[i]]; +// } +//} + +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_quick_f32(x[i]); + } +} +#endif + +// Sigmoid Linear Unit (SiLU) function +inline static float ggml_silu_f32(float x) { + return x/(1.0f + expf(-x)); +} + +#if __FINITE_MATH_ONLY__ +#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" +#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461" +#endif + +#if defined(__ARM_NEON) && defined(__aarch64__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static float32x4_t ggml_v_expf(float32x4_t x) { + const float32x4_t r = vdupq_n_f32(0x1.8p23f); + const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); + const float32x4_t n = vsubq_f32(z, r); + const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, + vdupq_n_f32(0x1.7f7d1cp-20f)); + const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); + const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); + const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); + const float32x4_t u = vmulq_f32(b, b); + const float32x4_t j = vfmaq_f32( + vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), + vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), + vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); + if (!vpaddd_u64(vreinterpretq_u64_u32(c))) + return vfmaq_f32(k, j, k); + const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); + const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); + const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); + return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), + vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static float32x4_t ggml_v_silu(float32x4_t x) { + const float32x4_t one = vdupq_n_f32(1.0f); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t neg_x = vsubq_f32(zero, x); + const float32x4_t exp_neg_x = ggml_v_expf(neg_x); + const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); + return vdivq_f32(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX512F__) && defined(__AVX512DQ__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m512 ggml_v_expf(__m512 x) { + const __m512 r = _mm512_set1_ps(0x1.8p23f); + const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); + const __m512 n = _mm512_sub_ps(z, r); + const __m512 b = + _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), + _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); + const __mmask16 d = + _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); + const __m512 u = _mm512_mul_ps(b, b); + const __m512 j = _mm512_fmadd_ps( + _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, + _mm512_set1_ps(0x1.573e2ep-5f)), + u, + _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, + _mm512_set1_ps(0x1.fffdb6p-2f))), + u, + _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F))); + const __m512 res = _mm512_scalef_ps(j, n); + if (_mm512_kortestz(d, d)) + return res; + const __m512 zero = _mm512_setzero_ps(); + const __m512 alt = _mm512_mask_blend_ps( + _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero); + return _mm512_mask_blend_ps(d, res, alt); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m512 ggml_v_silu(__m512 x) { + const __m512 one = _mm512_set1_ps(1); + const __m512 zero = _mm512_setzero_ps(); + const __m512 neg_x = _mm512_sub_ps(zero, x); + const __m512 exp_neg_x = ggml_v_expf(neg_x); + const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); + return _mm512_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__AVX2__) && defined(__FMA__) + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m256 ggml_v_expf(__m256 x) { + const __m256 r = _mm256_set1_ps(0x1.8p23f); + const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); + const __m256 n = _mm256_sub_ps(z, r); + const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), + _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); + const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); + const __m256 k = _mm256_castsi256_ps( + _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); + const __m256i c = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(126), _CMP_GT_OQ)); + const __m256 u = _mm256_mul_ps(b, b); + const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, + _mm256_set1_ps(0x1.573e2ep-5f)), u, + _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, + _mm256_set1_ps(0x1.fffdb6p-2f))), + u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) + return _mm256_fmadd_ps(j, k, k); + const __m256i g = _mm256_and_si256( + _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), + _mm256_set1_epi32(0x82000000u)); + const __m256 s1 = + _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); + const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); + const __m256i d = _mm256_castps_si256( + _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), + _mm256_set1_ps(192), _CMP_GT_OQ)); + return _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), + _mm256_andnot_ps( + _mm256_castsi256_ps(d), + _mm256_or_ps( + _mm256_and_ps(_mm256_castsi256_ps(c), + _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), + _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m256 ggml_v_silu(__m256 x) { + const __m256 one = _mm256_set1_ps(1); + const __m256 zero = _mm256_setzero_ps(); + const __m256 neg_x = _mm256_sub_ps(zero, x); + const __m256 exp_neg_x = ggml_v_expf(neg_x); + const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); + return _mm256_div_ps(x, one_plus_exp_neg_x); +} + +#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON + +#if defined(__FMA__) +#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) +#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) +#else +#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) +#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) +#endif + +// adapted from arm limited optimized routine +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m128 ggml_v_expf(__m128 x) { + const __m128 r = _mm_set1_ps(0x1.8p23f); + const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); + const __m128 n = _mm_sub_ps(z, r); + const __m128 b = + NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); + const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); + const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); + const __m128i c = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); + const __m128 u = _mm_mul_ps(b, b); + const __m128 j = + MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, + MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), + u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm_movemask_epi8(c)) + return MADD128(j, k, k); + const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), + _mm_set1_epi32(0x82000000u)); + const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); + const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); + const __m128i d = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); + return _mm_or_ps( + _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), + _mm_andnot_ps(_mm_castsi128_ps(d), + _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), + _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m128 ggml_v_silu(__m128 x) { + const __m128 one = _mm_set1_ps(1); + const __m128 zero = _mm_setzero_ps(); + const __m128 neg_x = _mm_sub_ps(zero, x); + const __m128 exp_neg_x = ggml_v_expf(neg_x); + const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); + return _mm_div_ps(x, one_plus_exp_neg_x); +} + +#endif // __ARM_NEON / __AVX2__ / __SSE2__ + +static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + int i = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i))); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i))); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 4) { + vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); + } +#endif + for (; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]); + } +} + +static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { + int i = 0; + ggml_float sum = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), + _mm512_set1_ps(max))); + _mm512_storeu_ps(y + i, val); + sum += (ggml_float)_mm512_reduce_add_ps(val); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), + _mm256_set1_ps(max))); + _mm256_storeu_ps(y + i, val); + __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), + _mm256_castps256_ps128(val)); + val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); + val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); + sum += (ggml_float)_mm_cvtss_f32(val2); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), + _mm_set1_ps(max))); + _mm_storeu_ps(y + i, val); +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) + val = _mm_add_ps(val, _mm_movehl_ps(val, val)); + val = _mm_add_ss(val, _mm_movehdup_ps(val)); +#else + __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); + val = _mm_add_ps(val, tmp); + tmp = _mm_movehl_ps(tmp, val); + val = _mm_add_ss(val, tmp); +#endif + sum += (ggml_float)_mm_cvtss_f32(val); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 4) { + float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), + vdupq_n_f32(max))); + vst1q_f32(y + i, val); + sum += (ggml_float)vaddvq_f32(val); + } +#endif + for (; i < n; ++i) { + float val = expf(x[i] - max); + sum += (ggml_float)val; + y[i] = val; + } + return sum; +} + +static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) { + // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i) + + int i = 0; + ggml_float sum = 0; + for (; i < n; ++i) { + float val = x[i] - max; + y[i] = val; + sum += (ggml_float)expf(val); + } + return sum = (ggml_float)logf(sum); +} + +inline static float ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + } +} + +inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +#else + vDSP_sve(x, 1, s, n); +#endif +} + +inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +} + +inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_FP16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_BF16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + float max = -INFINITY; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + *s = max; +#else + vDSP_maxv(x, 1, s, n); +#endif +} + +inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_vec_norm_f32(n, s, x); + *s = 1.f/(*s); +} + +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + +// Helpers for polling loops +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) +static inline void ggml_thread_cpu_relax(void) { + __asm__ volatile("yield" ::: "memory"); +} +#elif defined(__x86_64__) +static inline void ggml_thread_cpu_relax(void) { + _mm_pause(); +} +#else +static inline void ggml_thread_cpu_relax(void) {;} +#endif + +// +// NUMA support +// + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + enum ggml_numa_strategy numa_strategy; + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system + uint32_t current_node; // node on which main process is execting +#if defined(__gnu_linux__) + cpu_set_t cpuset; // cpuset from numactl +#else + uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype +#endif +}; + +// +// ggml state +// + +struct ggml_state { + struct ggml_numa_nodes numa; +}; + +// global state +static struct ggml_state g_state = {0}; +static atomic_flag g_state_critical = ATOMIC_FLAG_INIT; + +// TODO: move to threading file +// critical section via spin lock +void ggml_critical_section_start(void) { + while (atomic_flag_test_and_set(&g_state_critical)) { + // spin + sched_yield(); + } +} + +void ggml_critical_section_end(void) { + atomic_flag_clear(&g_state_critical); +} + +static void ggml_barrier(struct ggml_threadpool * tp) { + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + +#ifdef GGML_USE_OPENMP + #pragma omp barrier +#else + int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst); + + if (n_barrier == (n_threads - 1)) { + // last thread + atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed); + + // exit barrier (fill seq-cst fence) + atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst); + return; + } + + // wait for other threads + while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) { + ggml_thread_cpu_relax(); + } + + // exit barrier (full seq-cst fence) + // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead + #ifdef GGML_TSAN_ENABLED + atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst); + #else + atomic_thread_fence(memory_order_seq_cst); + #endif +#endif +} + +#if defined(__gnu_linux__) +static cpu_set_t ggml_get_numa_affinity(void) { + cpu_set_t cpuset; + pthread_t thread; + thread = pthread_self(); + CPU_ZERO(&cpuset); + pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + return cpuset; +} +#else +static uint32_t ggml_get_numa_affinity(void) { + return 0; // no NUMA support +} +#endif + +void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + if (g_state.numa.n_nodes > 0) { + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); + + return; + } + +#if defined(__gnu_linux__) + struct stat st; + char path[256]; + int rv; + + // set numa scheme + g_state.numa.numa_strategy = numa_flag; + + GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); + + g_state.numa.cpuset = ggml_get_numa_affinity(); + + // enumerate nodes + while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.n_nodes; + } + + // enumerate CPUs + while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.total_cpus; + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + // figure out which node we're on + uint current_cpu; + int getcpu_ret = 0; +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) + getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); +#else + // old glibc doesn't have a wrapper for this call. Fall back on direct syscall +# if !defined(SYS_getcpu) && defined(SYS_get_cpu) +# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name +# endif + getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); +#endif + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { + g_state.numa.n_nodes = 0; + return; + } + + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + + if (ggml_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + UNUSED(numa_flag); + // TODO +#endif +} + +bool ggml_is_numa(void) { + return g_state.numa.n_nodes > 1; +} + +#if defined(__ARM_ARCH) + +#if defined(__linux__) && defined(__aarch64__) +#include +#elif defined(__APPLE__) +#include +#endif + +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM 0 +#endif + +static void ggml_init_arm_arch_features(void) { +#if defined(__linux__) && defined(__aarch64__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); + +#if defined(__ARM_FEATURE_SVE) + ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); +#endif +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_neon = oldp; + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_i8mm = oldp; + + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; +#else +// Run-time CPU feature detection not implemented for this platform, fallback to compile time +#if defined(__ARM_NEON) + ggml_arm_arch_features.has_neon = 1; +#else + ggml_arm_arch_features.has_neon = 0; +#endif + +#if defined(__ARM_FEATURE_MATMUL_INT8) + ggml_arm_arch_features.has_i8mm = 1; +#else + ggml_arm_arch_features.has_i8mm = 0; +#endif + +#if defined(__ARM_FEATURE_SVE) + ggml_arm_arch_features.has_sve = 1; + ggml_arm_arch_features.sve_cnt = 16; +#else + ggml_arm_arch_features.has_sve = 0; + ggml_arm_arch_features.sve_cnt = 0; +#endif +#endif +} +#endif + +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { + GGML_ASSERT(!ggml_get_no_alloc(ctx)); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + + ggml_set_i32(result, value); + + return result; +} + +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { + GGML_ASSERT(!ggml_get_no_alloc(ctx)); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_set_f32(result, value); + + return result; +} + +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_BF16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + + return tensor; +} + +struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_BF16: + { + assert(tensor->nb[0] == sizeof(ggml_bf16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + + return tensor; +} + +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + } + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_ABORT("fatal error"); + } + } +} + +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_BF16: + return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ABORT("fatal error"); + } +} + +void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_BF16: + { + ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + return ((int8_t *)(tensor->data))[i]; + } + case GGML_TYPE_I16: + { + return ((int16_t *)(tensor->data))[i]; + } + case GGML_TYPE_I32: + { + return ((int32_t *)(tensor->data))[i]; + } + case GGML_TYPE_F16: + { + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } + case GGML_TYPE_BF16: + { + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + } + case GGML_TYPE_F32: + { + return ((float *)(tensor->data))[i]; + } + default: + { + GGML_ABORT("fatal error"); + } + } +} + +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_BF16: + { + ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + return ((int8_t *) data)[0]; + case GGML_TYPE_I16: + return ((int16_t *) data)[0]; + case GGML_TYPE_I32: + return ((int32_t *) data)[0]; + case GGML_TYPE_F16: + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_BF16: + return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); + case GGML_TYPE_F32: + return ((float *) data)[0]; + default: + GGML_ABORT("fatal error"); + } +} + +void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_BF16: + { + ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +// ggml_compute_forward_dup + +static void ggml_compute_forward_dup_same_cont( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_ASSERT(src0->type == dst->type); + + const size_t nb0 = ggml_type_size(src0->type); + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + if (ie0 < ie1) { + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb0), + (ie1 - ie0) * nb0); + } +} + +static void ggml_compute_forward_dup_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (ggml_get_type_traits(dst->type)->from_float) { + ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } +} + +static void ggml_compute_forward_dup_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_bf16_t)) { + if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (ggml_get_type_traits(dst->type)->from_float) { + ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_BF16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } +} + +static void ggml_compute_forward_dup_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (nb00 == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (ggml_get_type_traits(dst->type)->from_float) { + ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(float)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_BF16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ABORT("fatal error"); // TODO: implement + } +} + +// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. +static void ggml_compute_forward_dup_bytes( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(src0->type == dst->type); + + GGML_TENSOR_UNARY_OP_LOCALS; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + ggml_compute_forward_dup_same_cont(params, dst); + return; + } + + const size_t type_size = ggml_type_size(src0->type); + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == type_size && nb0 == type_size) { + // copy by rows + const size_t rs = ne00 * type_size; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + size_t id = 0; + char * dst_ptr = (char *) dst->data; + const size_t rs = ne00 * type_size; + + if (nb00 == type_size) { + // src0 is contigous on first dimension, copy by rows + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, type_size); + + id += type_size; + } + } + id += rs * (ne01 - ir1); + } + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, type_size); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (src0->type == dst->type) { + ggml_compute_forward_dup_bytes(params, dst); + return; + } + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_dup_f16(params, dst); + } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_dup_bf16(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_dup_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_add + +static void ggml_compute_forward_add_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + } + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + if (dst->type == GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } + } + else { + // src1 is not contiguous + GGML_ABORT("fatal error"); + } +} + +static void ggml_compute_forward_add_bf16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + } + + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + if (dst->type == GGML_TYPE_BF16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } + } + else { + // src1 is not contiguous + GGML_ABORT("fatal error"); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_ABORT("fatal error"); + } +} + +static void ggml_compute_forward_add_bf16_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_BF16); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_bf16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_ABORT("fatal error"); + } +} + +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + const enum ggml_type dtype = dst->type; + ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; + ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } + } +} + +static void ggml_compute_forward_add( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f32(params, dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_BF16: + { + if (src1->type == GGML_TYPE_BF16) { + ggml_compute_forward_add_bf16_bf16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_bf16_f32(params, dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + { + ggml_compute_forward_add_q_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_add1 + +static void ggml_compute_forward_add1_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_add1_f32); + + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_add1_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) src1->data); +#endif + } +} + +static void ggml_compute_forward_add1_f16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_f16_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + // scalar to add + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; + ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float; + + // we don't support permuted src0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + + assert(ne0 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne0); + // add src1 + ggml_vec_acc1_f32(ne0, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne0); + } +} + +static void ggml_compute_forward_add1_bf16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_bf16_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + // scalar to add + const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_BF16); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add1_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_BF16: + { + if (src1->type == GGML_TYPE_BF16) { + ggml_compute_forward_add1_bf16_bf16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_bf16_f32(params, dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + { + ggml_compute_forward_add1_q_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_acc + +static void ggml_compute_forward_acc_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during acc + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + if (params->ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during acc + const size_t nb0 = ggml_element_size(src0); + + const size_t nb00 = nb0; + const size_t nb01 = nb1; + const size_t nb02 = nb2; + const size_t nb03 = nb3; + + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); +#else + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + } +} + +static void ggml_compute_forward_acc( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_acc_f32(params, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sub + +static void ggml_compute_forward_sub_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_sub( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sub_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_mul + +static void ggml_compute_forward_mul_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0 ; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); + + vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_mul( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mul_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_div + +static void ggml_compute_forward_div_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + + vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_div( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_div_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sqr + +static void ggml_compute_forward_sqr_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqr_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqr( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqr_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sqrt + +static void ggml_compute_forward_sqrt_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqrt_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqrt( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqrt_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_log + +static void ggml_compute_forward_log_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_log_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_log( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_log_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sin + +static void ggml_compute_forward_sin_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sin_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sin( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sin_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_cos + +static void ggml_compute_forward_cos_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_cos_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_cos( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cos_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sum + +static void ggml_compute_forward_sum_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + ggml_float sum = 0; + ggml_float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32_ggf(ne00, + &row_sum, + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; + } + } + } + ((float *) dst->data)[0] = sum; +} + +static void ggml_compute_forward_sum_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_scalar(dst)); + + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); +} + +static void ggml_compute_forward_sum_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_scalar(dst)); + + assert(src0->nb[0] == sizeof(ggml_bf16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_bf16_ggf(ne00, + &row_sum, + (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); +} + +static void ggml_compute_forward_sum( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_sum_f16(params, dst); + } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_sum_bf16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sum_rows + +static void ggml_compute_forward_sum_rows_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; + } + } + } +} + +static void ggml_compute_forward_sum_rows( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_rows_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_mean + +static void ggml_compute_forward_mean_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); + + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + } + } + } +} + +static void ggml_compute_forward_mean( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mean_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_argmax + +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_count_equal + +static void ggml_compute_forward_count_equal_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_I64); + + const int64_t nr = ggml_nrows(src0); + + const int ith = params->ith; + const int nth = params->nth; + + int64_t * sums = (int64_t *) params->wdata; + int64_t sum_thread = 0; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir / (ne02*ne01); + const int64_t i02 = (ir - i03*ne03) / ne01; + const int64_t i01 = ir - i03*ne03 - i02*ne02; + + const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; + const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; + + for (int64_t i00 = 0; i00 < ne00; ++i00) { + const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); + const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); + + sum_thread += val0 == val1; + } + } + if (ith != 0) { + sums[ith] = sum_thread; + } + ggml_barrier(params->threadpool); + + if (ith != 0) { + return; + } + + for (int ith_other = 1; ith_other < nth; ++ith_other) { + sum_thread += sums[ith_other]; + } + *((int64_t *) dst->data) = sum_thread; +} + +static void ggml_compute_forward_count_equal( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_I32: + { + ggml_compute_forward_count_equal_i32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_repeat + +static void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + GGML_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + GGML_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_I16: + { + ggml_compute_forward_repeat_f16(params, dst); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_repeat_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + GGML_TENSOR_UNARY_OP_LOCALS + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_concat + +static void ggml_compute_forward_concat_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int32_t dim = ggml_get_op_params_i32(dst, 0); + + GGML_ASSERT(dim >= 0 && dim < 4); + + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; + + const float * x; + + // TODO: smarter multi-theading + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + } else { + x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + } + + float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_concat( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_concat_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_abs + +static void ggml_compute_forward_abs_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_abs_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_abs( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_abs_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sgn + +static void ggml_compute_forward_sgn_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_sgn_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sgn( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sgn_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_neg + +static void ggml_compute_forward_neg_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_neg_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_neg( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_neg_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_step + +static void ggml_compute_forward_step_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_step_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_step( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_step_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_tanh + +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_elu_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_relu + +static void ggml_compute_forward_relu_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_relu( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_relu_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_sigmoid + +static void ggml_compute_forward_sigmoid_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_sigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sigmoid( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sigmoid_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_gelu + +static void ggml_compute_forward_gelu_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_gelu_quick + +static void ggml_compute_forward_gelu_quick_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_quick_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_quick( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_quick_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_silu + +static void ggml_compute_forward_silu_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} +// ggml_compute_forward_leaky_relu + +static void ggml_compute_forward_leaky_relu_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_leaky_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); + } +} + +static void ggml_compute_forward_leaky_relu( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_leaky_relu_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_silu_back + +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * grad = dst->src[1]; + + assert(ggml_is_contiguous_1(grad)); + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, grad)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + + +static void ggml_compute_forward_hardswish_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_hardswish_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} +static void ggml_compute_forward_hardswish( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardswish_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_hardsigmoid_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_hardsigmoid_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_hardsigmoid( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_hardsigmoid_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_exp_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + ggml_vec_exp_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_exp( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_exp_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + + +// ggml_compute_forward_norm + +static void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps > 0.0f); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + + float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_float sum2 = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v*v); + } + + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_norm_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_group_rms_norm + +static void ggml_compute_forward_rms_norm_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps > 0.0f); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)(x[i00] * x[i00]); + } + + const float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + memcpy(y, x, ne00 * sizeof(float)); + // for (int i00 = 0; i00 < ne00; i00++) { + // y[i00] = x[i00]; + // } + + const float scale = 1.0f/sqrtf(mean + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_rms_norm( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_rms_norm_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_BINARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + // src1 is same shape as src0 => same indices + const int64_t i11 = i01; + const int64_t i12 = i02; + const int64_t i13 = i03; + + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + + ggml_float sum_xx = 0.0; + ggml_float sum_xdz = 0.0; + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum_xx += (ggml_float)(x[i00] * x[i00]); + sum_xdz += (ggml_float)(x[i00] * dz[i00]); + } + + //const float mean = (float)(sum_xx)/ne00; + const float mean_eps = (float)(sum_xx)/ne00 + eps; + const float sum_eps = (float)(sum_xx) + eps*ne00; + //const float mean_xdz = (float)(sum_xdz)/ne00; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. + //const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + { + // z = rms_norm(x) + // + // rms_norm(src0) = + // scale( + // src0, + // div( + // 1, + // sqrt( + // add( + // scale( + // sum( + // sqr( + // src0)), + // (1.0/N)), + // eps)))); + + // postorder: + // ## op args grad + // 00 param src0 grad[#00] + // 01 const 1 + // 02 sqr (#00) grad[#02] + // 03 sum (#02) grad[#03] + // 04 const 1/N + // 05 scale (#03, #04) grad[#05] + // 06 const eps + // 07 add (#05, #06) grad[#07] + // 08 sqrt (#07) grad[#08] + // 09 div (#01,#08) grad[#09] + // 10 scale (#00,#09) grad[#10] + // + // backward pass, given grad[#10] + // #10: scale + // grad[#00] += scale(grad[#10],#09) + // grad[#09] += sum(mul(grad[#10],#00)) + // #09: div + // grad[#08] += neg(mul(grad[#09], div(#09,#08))) + // #08: sqrt + // grad[#07] += mul(grad[#08], div(0.5, #08)) + // #07: add + // grad[#05] += grad[#07] + // #05: scale + // grad[#03] += scale(grad[#05],#04) + // #03: sum + // grad[#02] += repeat(grad[#03], #02) + // #02: + // grad[#00] += scale(mul(#00, grad[#02]), 2.0) + // + // substitute and simplify: + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#02] = repeat(grad[#03], #02) + // grad[#02] = repeat(scale(grad[#05],#04), #02) + // grad[#02] = repeat(scale(grad[#07],#04), #02) + // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) + // a = b*c + d*e + // a = b*c*f/f + d*e*f/f + // a = (b*c*f + d*e*f)*(1/f) + // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) + // a = (b + d*e/c)*c + // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms + // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms + // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms + // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms + // a = (dz + x*div(-mean_xdz,mean_eps))*rrms + // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) + // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + } + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // post-order: + // dx := x + // dx := scale(dx,-mean_xdz/mean_eps) + // dx := add(dx, dz) + // dx := scale(dx, rrms) + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_vec_cpy_f32 (ne00, dx, x); + // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_vec_acc_f32 (ne00, dx, dz); + ggml_vec_scale_f32(ne00, dx, rrms); + } + } + } +} + +static void ggml_compute_forward_rms_norm_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_group_norm + +static void ggml_compute_forward_group_norm_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + // TODO: optimize + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i += nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + ggml_float sumr = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sumr += (ggml_float)x[i00]; + } + sum += sumr; + } + } + const float mean = sum / (ne00 * ne01 * step); + + ggml_float sum2 = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + ggml_float sumr = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sumr += (ggml_float)(v * v); + } + sum2 += sumr; + } + } + const float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_vec_scale_f32(ne00, y, scale); + } + } + } + } +} + +static void ggml_compute_forward_group_norm( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_group_norm_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_mul_mat + +static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const int64_t num_rows_per_vec_dot, + const int64_t ir0_start, + const int64_t ir0_end, + const int64_t ir1_start, + const int64_t ir1_end) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end); + + // threads with no work simply yield (not sure if it helps) + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } +} + +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float; + ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; + int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows; + int64_t const matmul_num_cols = type_traits_cpu[type].ncols; + int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave; + ggml_gemv_t const gemv = type_traits_cpu[type].gemv; + ggml_gemm_t const gemm = type_traits_cpu[type].gemm; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if GGML_USE_LLAMAFILE + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const bool src1_cont = ggml_is_contiguous(src1); + + if (src1_cont) { + for (int64_t i13 = 0; i13 < ne13; i13++) + for (int64_t i12 = 0; i12 < ne12; i12++) + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + nb01/ggml_type_size(src0->type), + (const char *)src1->data + i12*nb12 + i13*nb13, + nb11/ggml_type_size(src1->type), + (char *)dst->data + i12*nb2 + i13*nb3, + nb1/ggml_type_size(dst->type), + ith, nth, + src0->type, + src1->type, + dst->type)) + goto UseGgmlGemm1; + return; + } +UseGgmlGemm1:; +#endif + + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + assert(params->wsize >= ne13*nbw3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + int64_t i11_processed = 0; + if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + 4, ne10, blck_size_interleave); + } + i11_processed = ne11 - ne11 % 4; + } + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); + } + } + } + } + + if (ith == 0) { + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + atomic_store_explicit(¶ms->threadpool->current_chunk, nth, memory_order_relaxed); + } + + ggml_barrier(params->threadpool); + +#if GGML_USE_LLAMAFILE + if (src1->type != vec_dot_type) { + const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + for (int64_t i13 = 0; i13 < ne13; i13++) + for (int64_t i12 = 0; i12 < ne12; i12++) + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + nb01/ggml_type_size(src0->type), + (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, + row_size/ggml_type_size(vec_dot_type), + (char *)dst->data + i12*nb2 + i13*nb3, + nb1/ggml_type_size(dst->type), + ith, nth, + src0->type, + vec_dot_type, + dst->type)) + goto UseGgmlGemm2; + return; + } +UseGgmlGemm2:; +#endif + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int64_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int64_t nr1 = ne1 * ne2 * ne3; + + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t num_rows_per_vec_dot = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + + // Now select a reasonable chunk size. + int chunk_size = 16; + + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915 + // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. + if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) { + // distribute the thread work across the inner or outer loop based on which one is larger + nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + } + + // The number of elements in each chunk + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + if ((ggml_n_dims(src0) == 2) && gemv) { + const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; + src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; + if (src0_start >= src0_end) return; + + // If there are more than three rows in src1, use gemm; otherwise, use gemv. + if (gemm && (ne11 > 3)) { + gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) { + gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); + } + return; + } + + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = ith; + + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; + + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + + ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + + if (nth >= nchunk0 * nchunk1) { + break; + } + + current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); + } +} + +// ggml_compute_forward_mul_mat_id + +static void ggml_compute_forward_mul_mat_id( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * ids = dst->src[2]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float; + int64_t const matmul_num_cols = type_traits_cpu[type].ncols; + ggml_gemv_t const gemv = type_traits_cpu[type].gemv; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + char * wdata_src1_end = (src1->type == vec_dot_type) ? + (char *) params->wdata : + (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); + + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; + + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] + + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + assert(params->wsize >= ne13*nbw3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); + } + } + } + } + +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] + + if (ith == 0) { + // initialize matrix_row_counts + memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); + + // group rows by src0 matrix + for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int id = 0; id < n_ids; ++id) { + const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]); + + assert(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1}; + matrix_row_counts[i02] += 1; + } + } + } + + ggml_barrier(params->threadpool); + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const char * src0_cur = (const char *) src0->data + cur_a*nb02; + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows + + if (((ggml_n_dims(src0) - 1) == 2) && gemv) { + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; + src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12)); + + gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); + } + continue; + } + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + // threads with no work simply yield (not sure if it helps) + //if (ir010 >= ir011 || ir110 >= ir111) { + // sched_yield(); + // continue; + //} + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t _i12 = ir1; // logical row index for this expert + + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11)*row_size + : (i11*nb11 + i12*nb12)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); + } + + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } + } + +#undef MMID_MATRIX_ROW +} + +// ggml_compute_forward_out_prod + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (ith == 0) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } + ggml_barrier(params->threadpool); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // block-tiling attempt + const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; + + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + +#if GGML_VEC_MAD_UNROLL > 2 + const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#else + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#endif + } + } + } +} + +static void ggml_compute_forward_out_prod_q_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst dim0 cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (ith == 0) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } + ggml_barrier(params->threadpool); + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + dequantize_row_q(s0, wdata, ne0); + ggml_vec_mad_f32(ne0, d, wdata, *s1); + } + } +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + { + ggml_compute_forward_out_prod_q_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ABORT("fatal error"); // todo + // ggml_compute_forward_out_prod_f16_f32(params, dst); + } + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_scale + +static void ggml_compute_forward_scale_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + // scale factor + float v; + memcpy(&v, dst->op_params, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + for (int i1 = ir0; i1 < ir1; i1++) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + } +} + +static void ggml_compute_forward_scale( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_scale_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_set + +static void ggml_compute_forward_set_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + if (params->ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + +static void ggml_compute_forward_set( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_set_f32(params, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_cpy + +static void ggml_compute_forward_cpy( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, dst); +} + +// ggml_compute_forward_cont + +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, dst); +} + +// ggml_compute_forward_reshape + +static void ggml_compute_forward_reshape( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(dst); +} + +// ggml_compute_forward_view + +static void ggml_compute_forward_view( + const struct ggml_compute_params * params, + const struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(dst); +} + +// ggml_compute_forward_permute + +static void ggml_compute_forward_permute( + const struct ggml_compute_params * params, + const struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(dst); +} + +// ggml_compute_forward_transpose + +static void ggml_compute_forward_transpose( + const struct ggml_compute_params * params, + const struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(dst); +} + +// ggml_compute_forward_get_rows + +static void ggml_compute_forward_get_rows_q( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(type)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + dequantize_row_q( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } +} + +static void ggml_compute_forward_get_rows_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(ggml_fp16_t)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + ggml_fp16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } +} + +static void ggml_compute_forward_get_rows_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(ggml_bf16_t)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + ggml_bf16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } +} + +static void ggml_compute_forward_get_rows_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(float)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), + (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); + } +} + +static void ggml_compute_forward_get_rows( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + { + ggml_compute_forward_get_rows_q(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_f16(params, dst); + } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_get_rows_bf16(params, dst); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_get_rows_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_get_rows_back + +static void ggml_compute_forward_get_rows_back_f32_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_is_contiguous(dst)); + + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + memset(dst->data, 0, ggml_nbytes(dst)); + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + if (params->ith != 0) { + return; + } + + GGML_ASSERT(ggml_is_contiguous(dst)); + + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + memset(dst->data, 0, ggml_nbytes(dst)); + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) src0->data + i*src0->nb[1])); + } +} + +static void ggml_compute_forward_get_rows_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_back_f32_f16(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_diag + +static void ggml_compute_forward_diag_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + // TODO: handle transposed/permuted matrices + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne00 == ne1); + GGML_ASSERT(ne01 == 1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); + for (int i0 = 0; i0 < i1; i0++) { + d[i0] = 0; + } + d[i1] = s[i1]; + for (int i0 = i1+1; i0 < ne0; i0++) { + d[i0] = 0; + } + } + } + } +} + +static void ggml_compute_forward_diag( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_diag_mask_inf + +static void ggml_compute_forward_diag_mask_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const float value) { + + const struct ggml_tensor * src0 = dst->src[0]; + + const int ith = params->ith; + const int nth = params->nth; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; + + GGML_ASSERT(n_past >= 0); + + if (!inplace) { + if (ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + // TODO: handle transposed/permuted matrices + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = ith; j < nr; j += nth) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + } + } + } + } +} + +static void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, dst, 0); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_soft_max + +static void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + assert(ggml_is_contiguous(dst)); + assert(ggml_are_same_shape(src0, dst)); + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + //const int64_t ne11 = src1 ? src1->ne[1] : 1; + + // TODO: is this supposed to be ceil instead of floor? + // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 + const uint32_t n_head = ne02; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); + + for (int i1 = ir0; i1 < ir1; i1++) { + // ALiBi + const uint32_t h = (i1/ne01)%ne02; // head + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + + float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); + + // broadcast the mask across rows + ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; + float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; + + ggml_vec_cpy_f32 (nc, wp, sp); + ggml_vec_scale_f32(nc, wp, scale); + if (mp_f32) { + if (use_f16) { + for (int i = 0; i < nc; ++i) { + wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); + } + } else { + for (int i = 0; i < nc; ++i) { + wp[i] += slope*mp_f32[i]; + } + } + } + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(wp[i])); + } +#endif + + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, wp); + + ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max); + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(nc, dp, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + + +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_clamp + +static void ggml_compute_forward_clamp_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + + for (int i = 0; i < nc; i++) { + dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); + } + } +} + +static void ggml_compute_forward_clamp( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_clamp_f32(params, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q8_K: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: + case GGML_TYPE_COUNT: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_rope + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +static void ggml_rope_cache_init( + float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, + float * cache, float sin_sign, float theta_scale) { + // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py + float theta = theta_base; + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0/2] : 1.0f; + rope_yarn( + theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] + ); + cache[i0 + 1] *= sin_sign; + + theta *= theta_scale; + } +} + +void ggml_rope_yarn_corr_dims( + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); +} + +static void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb00 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + const float * freq_factors = NULL; + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } + } else { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0/2; + + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + + for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } + } + } +} + +// TODO: deduplicate f16/f32 code +static void ggml_compute_forward_rope_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const bool forward) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + const float * freq_factors = NULL; + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 1.0f : -1.0f; + + const int32_t * pos = (const int32_t *) src1->data; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } else { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0/2; + + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } + + for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } + } + } +} + +static void ggml_compute_forward_rope( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_f16(params, dst, true); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_f32(params, dst, true); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_rope_back + +static void ggml_compute_forward_rope_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_f16(params, dst, false); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_f32(params, dst, false); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_conv_transpose_1d + +static void ggml_compute_forward_conv_transpose_1d_f16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (ith == 0) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } + + // permute source data (src1) from (L x Cin) to (Cin x L) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + ggml_fp16_t * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne02, &v, 0, + (ggml_fp16_t *) wdata_src + i1n, 0, + (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_compute_forward_conv_transpose_1d_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (ith == 0) { + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i01*ne00*ne02; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ne02 + i02] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne11 + i11] = src[i10]; + } + } + } + + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + + // total rows in dst + const int nr = ne1; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f32(ne02, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i00*ne02, 0, 1); + dst_data[i10*s0 + i00] += v; + } + } + } +} + +static void ggml_compute_forward_conv_transpose_1d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_transpose_1d_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_im2col_f32 +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_im2col_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; + + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + + +// ggml_compute_forward_im2col_f16 +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_im2col_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? nb12 : nb11; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_im2col( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_im2col_f16(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_im2col_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_im2col_back_f32 + +static void ggml_compute_forward_im2col_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne3 : ne2; + const int64_t IC = is_2D ? ne2 : ne1; + const int64_t IH = is_2D ? ne1 : 1; + const int64_t IW = ne0; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne12 : 1; + const int64_t OW = ne11; + + int ofs0 = is_2D ? nb3 : nb2; + int ofs1 = is_2D ? nb2 : nb1; + + GGML_ASSERT(nb0 == sizeof(float)); + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + for (int64_t iih = 0; iih < IH; iih++) { + for (int64_t iiw = 0; iiw < IW; iiw++) { + + // micro kernel + float grad = 0.0f; + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + // For s0 > 1 some values were skipped over in the forward pass. + // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well. + const int64_t tmpw = (iiw + p0 - ikw*d0); + if (tmpw % s0 != 0) { + continue; + } + const int64_t iow = tmpw / s0; + + // Equivalent logic as above except for s1. + int64_t ioh; + if (is_2D) { + const int64_t tmph = iih + p1 - ikh*d1; + + if (tmph % s1 != 0) { + continue; + } + + ioh = tmph / s1; + } else { + ioh = 0; + } + + if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) { + continue; + } + + const float * const src_data = (const float *) src1->data + + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + grad += src_data[iic*(KH*KW) + ikh*KW + ikw]; + } + } + float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW] + dst_data[iih*IW + iiw] = grad; + } + } + } + } + } +} + +// ggml_compute_forward_conv_transpose_2d + +static void ggml_compute_forward_conv_transpose_2d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02*ne03; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (ith == 0) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } + + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + } + + memset(dst->data, 0, ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + + const int32_t stride = ggml_get_op_params_i32(dst, 0); + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } + } + } +} + +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const int k, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); + + if (params->ith != 0) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const void * srow = (const void *)cdata; + int j = 0; + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + for (int ki = 0; ki < k; ++ki) { + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); +} + +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); + + if (params->ith != 0) { + return; + } + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; + + for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + if (j < 0 || j >= src->ne[0]) continue; + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + switch (op) { + case GGML_OP_POOL_AVG: *out += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + } + } + } + + cdata += src->nb[2]; + dplane += pa; + } +} + +// ggml_compute_forward_pool_2d_back + +static void ggml_compute_forward_pool_2d_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src = dst->src[0]; + const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst + + assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + + if (params->ith != 0) { + return; + } + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + char * cdata = (char *) dst->data; + const char * cdataf = (const char *) dstf->data; + const char * const data_end = cdata + ggml_nbytes(dst); + + GGML_ASSERT(params->ith == 0); + memset(cdata, 0, ggml_nbytes(dst)); + + const int64_t px = src->ne[0]; + const int64_t py = src->ne[1]; + const int64_t pa = px * py; + + const float * splane = (const float *) src->data; + + const int ka = k0 * k1; + const int offset0 = -p0; + const int offset1 = -p1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + const float * const srow = splane + oy * px; + for (int ox = 0; ox < px; ++ox) { + const float grad0 = srow[ox]; + + const int ix = offset0 + ox * s0; + const int iy = offset1 + oy * s1; + + if (op == GGML_OP_POOL_MAX) { + float maxval = -FLT_MAX; + int kxmax = -1; + int kymax = -1; + + for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= dst->ne[1]) { + continue; + } + const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + if (j < 0 || j >= dst->ne[0]) { + continue; + } + + const float val = dst->type == GGML_TYPE_F32 ? + ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); + if (val <= maxval) { + continue; + } + + maxval = val; + kxmax = kx; + kymax = ky; + } + } + + if (kxmax == -1 || kymax == -1) { + continue; + } + + void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax)); + const int j = ix + kxmax; + if (dst->type == GGML_TYPE_F32) { + ((float *) drow)[j] += grad0; + } else { + ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); + } + } else if (op == GGML_OP_POOL_AVG) { + const float grad = grad0 / ka; + + for (int ky = 0; ky < k1; ++ky) { + if (iy + ky < 0 || iy + ky >= dst->ne[1]) { + continue; + } + void * drow = (void *)(cdata + dst->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + if (j < 0 || j >= dst->ne[0]) { + continue; + } + + if (dst->type == GGML_TYPE_F32) { + ((float *) drow)[j] += grad; + } else { + ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad); + } + } + } + } else { + GGML_ASSERT(false); + } + } + } + + cdata += dst->nb[2]; + cdataf += dst->nb[2]; + splane += pa; + } +} + +// ggml_compute_forward_upscale + +static void ggml_compute_forward_upscale_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const float sf0 = (float)ne0/src0->ne[0]; + const float sf1 = (float)ne1/src0->ne[1]; + const float sf2 = (float)ne2/src0->ne[2]; + const float sf3 = (float)ne3/src0->ne[3]; + + // TODO: optimize + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const int64_t i01 = i1 / sf1; + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t i00 = i0 / sf0; + + const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_upscale( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_upscale_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + + +// ggml_compute_forward_pad + +static void ggml_compute_forward_pad_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float * dst_ptr = (float *) dst->data; + + // TODO: optimize + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + for (int64_t i3 = 0; i3 < ne3; ++i3) { + const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; + + const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + dst_ptr[dst_idx] = *src_ptr; + } else { + dst_ptr[dst_idx] = 0; + } + } + } + } + } +} + +static void ggml_compute_forward_pad( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_pad_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + + +// ggml_compute_forward_arange + +static void ggml_compute_forward_arange_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const float start = ggml_get_op_params_f32(dst, 0); + const float stop = ggml_get_op_params_f32(dst, 1); + const float step = ggml_get_op_params_f32(dst, 2); + + const int64_t steps = (int64_t) ceilf((stop - start) / step); + + GGML_ASSERT(ggml_nelements(dst) == steps); + + for (int64_t i = ith; i < steps; i+= nth) { + float value = start + step * i; + ((float *)dst->data)[i] = value; + } +} + +static void ggml_compute_forward_arange( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_arange_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_timestep_embedding_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const int dim = ggml_get_op_params_i32(dst, 0); + const int max_period = ggml_get_op_params_i32(dst, 1); + + int half = dim / 2; + + for (int64_t i = 0; i < ne00; i++) { + float * embed_data = (float *)((char *) dst->data + i*nb1); + for (int64_t j = ith; j < half; j += nth) { + float timestep = ((float *)src0->data)[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); + } + if (dim % 2 != 0 && ith == 0) { + embed_data[dim] = 0.f; + } + } +} + +static void ggml_compute_forward_timestep_embedding( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_timestep_embedding_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_argsort + +static void ggml_compute_forward_argsort_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const float * src_data = (float *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + +static void ggml_compute_forward_argsort( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argsort_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_flash_attn_ext + +static void ggml_compute_forward_flash_attn_ext_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * mask, + struct ggml_tensor * dst) { + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nev0 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t rk2 = neq2/nek2; + const int64_t rk3 = neq3/nek3; + + const int64_t rv2 = neq2/nev2; + const int64_t rv3 = neq3/nev3; + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + const uint32_t n_head = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type; + ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits(k_vec_dot_type)->from_float; + ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot; + ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float; + + GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type"); + GGML_ASSERT(v_to_float && "fattn: unsupported V-type"); + + // loop over n_batch and n_head + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + const uint32_t h = iq2; // head index + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + + float S = 0.0f; // sum + float M = -INFINITY; // maximum KQ value + + float * VKQ32 = (float *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator + float * V32 = (VKQ32 + 1*D); // (temporary) FP32 V buffer + ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator + ggml_fp16_t * Q_q = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16 + + if (v->type == GGML_TYPE_F16) { + memset(VKQ16, 0, D*sizeof(ggml_fp16_t)); + } else { + memset(VKQ32, 0, D*sizeof(float)); + } + + const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; + + // k indices + const int ik3 = iq3 / rk3; + const int ik2 = iq2 / rk2; + + // v indices + const int iv3 = iq3 / rv3; + const int iv2 = iq2 / rv2; + + const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); + q_to_vec_dot(pq, Q_q, D); + + // online softmax / attention + // loop over n_kv and n_head_kv + // ref: https://arxiv.org/pdf/2112.05682.pdf + for (int64_t ic = 0; ic < nek1; ++ic) { + const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f; + if (mv == -INFINITY) { + continue; + } + + float s; // KQ value + + const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); + kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1); + + s = s*scale; // scale KQ value + + if (logit_softcap != 0.0f) { + s = logit_softcap*tanhf(s); + } + + s += mv; // apply mask + + const float Mold = M; + + float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value + float vs = 1.0f; // post-softmax KQ value, expf(s - M) + + const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); + + if (v->type == GGML_TYPE_F16) { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + ggml_vec_scale_f16(D, VKQ16, ms); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + // V += v*expf(s - M) + ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs); + } else { + if (s > M) { + // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + ggml_vec_scale_f32(D, VKQ32, ms); + } else { + // no new maximum, ms == 1.0f, vs != 1.0f + vs = expf(s - M); + } + + v_to_float(v_data, V32, D); + + // V += v*expf(s - M) + ggml_vec_mad_f32(D, VKQ32, V32, vs); + } + + S = S*ms + vs; // scale and increment sum with partial sum + } + + if (v->type == GGML_TYPE_F16) { + for (int64_t d = 0; d < D; ++d) { + VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]); + } + } + + // V /= S + const float S_inv = 1.0f/S; + ggml_vec_scale_f32(D, VKQ32, S_inv); + + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // original + //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); + + // permute(0, 2, 1, 3) + memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); + } +} + +static void ggml_compute_forward_flash_attn_ext( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * mask, + struct ggml_tensor * dst) { + switch (dst->op_params[3]) { + case GGML_PREC_DEFAULT: + case GGML_PREC_F32: + { + // uses F32 accumulators + ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const bool masked, + struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + const struct ggml_tensor * k = dst->src[1]; + const struct ggml_tensor * v = dst->src[2]; + const struct ggml_tensor * d = dst->src[3]; + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ned, d, ne) + GGML_TENSOR_LOCALS(size_t, nbd, d, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + ggml_barrier(params->threadpool); + + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + + enum ggml_type result_type = dst->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using ggml_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int ik3 = ir/(nek2); + const int ik2 = ir - ik3*nek2; + + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; + + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; + + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { + // k indices + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); + } + + // scale + ggml_vec_scale_f32(masked_begin, S, scale); + + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; + } + + // softmax + // exclude known -INF S[..] values from max and loop + // dont forget to set their SM values to zero + { + float max = -INFINITY; + ggml_vec_max_f32(masked_begin, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + sum = ggml_vec_soft_max_f32(Mup, SM, S, max); +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(masked_begin, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } + + // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + // exclude known future zero S[..] values from operation + ggml_vec_set_f32(masked_begin, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (masked_begin, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + // already done by above ggml_vec_set_f32 + + // exclude known zero S[..] values from operation + ggml_vec_scale_f32(masked_begin, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } + + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // exclude known zero SM[..] values from mad + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(masked_begin, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } + } + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params * params, + const bool masked, + struct ggml_tensor * dst) { + + const struct ggml_tensor * q = dst->src[0]; + + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, masked, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_ssm_conv + +static void ggml_compute_forward_ssm_conv_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; // conv_x + const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1->ne[0]; // d_conv + const int ncs = src0->ne[0]; // d_conv - 1 + n_t + const int nr = src0->ne[1]; // d_inner + const int n_t = dst->ne[1]; // tokens per sequence + const int n_s = dst->ne[2]; // number of sequences in the batch + + GGML_ASSERT( dst->ne[0] == nr); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + const int ir = ir1 - ir0; + + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + // {d_conv - 1 + n_t, d_inner, n_seqs} + // sliding window + const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} + const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner} + float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} + + // TODO: transpose the output for smaller strides for big batches? + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // rowwise dot product + // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision + float sumf = 0.0f; + + // d_conv + for (int i0 = 0; i0 < nc; ++i0) { + sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; + } + x[i1] = sumf; + } + } + } +} + +static void ggml_compute_forward_ssm_conv( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_conv_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_ssm_scan + +static void ggml_compute_forward_ssm_scan_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; // s + const struct ggml_tensor * src1 = dst->src[1]; // x + const struct ggml_tensor * src2 = dst->src[2]; // dt + const struct ggml_tensor * src3 = dst->src[3]; // A + const struct ggml_tensor * src4 = dst->src[4]; // B + const struct ggml_tensor * src5 = dst->src[5]; // C + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_t = src1->ne[1]; // number of tokens per sequence + const int64_t n_s = src0->ne[2]; // number of sequences in the batch + + GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(float)); + GGML_ASSERT(src4->nb[0] == sizeof(float)); + GGML_ASSERT(src5->nb[0] == sizeof(float)); + // required for the dot product between s and C + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + // required for per-sequence offsets for states + GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); + // required to get correct offset for state destination (i.e. src1->nb[3]) + GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + const int ir = ir1 - ir0; + + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} + const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} + const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} + const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} + float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} + + // use the output as the source for the next token-wise iterations + if (i2 > 0) { s0 = s; } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 + float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; + float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // state = prev_state * dA + dB * x + float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[i0]; + s[i] = state; + } + y[i1] = sumf; + } + } + } +} + +static void ggml_compute_forward_ssm_scan( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_scan_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_win_part + +static void ggml_compute_forward_win_part_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + UNUSED(params); + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; + + assert(ne00 == ne0); + assert(ne3 == nep0*nep1); + + // TODO: optimize / multi-thread + for (int py = 0; py < nep1; ++py) { + for (int px = 0; px < nep0; ++px) { + const int64_t i3 = py*nep0 + px; + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i02 = py*w + i2; + const int64_t i01 = px*w + i1; + const int64_t i00 = i0; + + const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; + const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; + + if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { + ((float *) dst->data)[i] = 0.0f; + } else { + ((float *) dst->data)[i] = ((float *) src0->data)[j]; + } + } + } + } + } + } +} + +static void ggml_compute_forward_win_part( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_part_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_win_unpart + +static void ggml_compute_forward_win_unpart_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + UNUSED(params); + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + + const int32_t w = ((const int32_t *)(dst->op_params))[0]; + + // padding + const int px = (w - ne1%w)%w; + //const int py = (w - ne2%w)%w; + + const int npx = (px + ne1)/w; + //const int npy = (py + ne2)/w; + + assert(ne0 == ne00); + + // TODO: optimize / multi-thread + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int ip2 = i2/w; + const int ip1 = i1/w; + + const int64_t i02 = i2%w; + const int64_t i01 = i1%w; + const int64_t i00 = i0; + + const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; + const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; + + ((float *) dst->data)[j] = ((float *) src0->data)[i]; + } + } + } +} + +static void ggml_compute_forward_win_unpart( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_unpart_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +//gmml_compute_forward_unary + +static void ggml_compute_forward_unary( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const enum ggml_unary_op op = ggml_get_unary_op(dst); + + switch (op) { + case GGML_UNARY_OP_ABS: + { + ggml_compute_forward_abs(params, dst); + } break; + case GGML_UNARY_OP_SGN: + { + ggml_compute_forward_sgn(params, dst); + } break; + case GGML_UNARY_OP_NEG: + { + ggml_compute_forward_neg(params, dst); + } break; + case GGML_UNARY_OP_STEP: + { + ggml_compute_forward_step(params, dst); + } break; + case GGML_UNARY_OP_TANH: + { + ggml_compute_forward_tanh(params, dst); + } break; + case GGML_UNARY_OP_ELU: + { + ggml_compute_forward_elu(params, dst); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_compute_forward_relu(params, dst); + } break; + case GGML_UNARY_OP_SIGMOID: + { + ggml_compute_forward_sigmoid(params, dst); + } break; + case GGML_UNARY_OP_GELU: + { + ggml_compute_forward_gelu(params, dst); + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + ggml_compute_forward_gelu_quick(params, dst); + } break; + case GGML_UNARY_OP_SILU: + { + ggml_compute_forward_silu(params, dst); + } break; + case GGML_UNARY_OP_HARDSWISH: + { + ggml_compute_forward_hardswish(params, dst); + } break; + case GGML_UNARY_OP_HARDSIGMOID: + { + ggml_compute_forward_hardsigmoid(params, dst); + } break; + case GGML_UNARY_OP_EXP: + { + ggml_compute_forward_exp(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_get_rel_pos + +static void ggml_compute_forward_get_rel_pos_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + UNUSED(params); + + const struct ggml_tensor * src0 = dst->src[0]; + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 + + GGML_TENSOR_UNARY_OP_LOCALS + + const int64_t w = ne1; + + ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } + } +} + +static void ggml_compute_forward_get_rel_pos( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + { + ggml_compute_forward_get_rel_pos_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_add_rel_pos + +static void ggml_compute_forward_add_rel_pos_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; + + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace) { + if (params->ith == 0) { + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 + + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} + +static void ggml_compute_forward_add_rel_pos( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_rel_pos_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_rwkv_wkv + +static void ggml_compute_forward_rwkv_wkv_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const size_t T = dst->src[1]->ne[3]; + const size_t C = dst->ne[0]; + const size_t H = dst->src[1]->ne[2]; + const size_t n_seqs = dst->src[5]->ne[1]; + + float * dst_data = (float *) dst->data; + float * state = ((float *) dst->data) + C * T; + + if (params->ith != 0) { + return; + } + + memset(dst_data, 0, T * C * sizeof(float)); + + float * k = (float *) dst->src[0]->data; + float * v = (float *) dst->src[1]->data; + float * r = (float *) dst->src[2]->data; + float * time_faaaa = (float *) dst->src[3]->data; + float * time_decay = (float *) dst->src[4]->data; + + size_t t_stride = H * (C / H); + + size_t h_stride = C / H; + size_t h_stride_2d = (C / H) * (C / H); + + // basically fused operations: + // dst = r @ (time_faaaa * (k @ v) + state), + // state = time_decay * state + (k @ v), + // recursive through each token + for (size_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + size_t state_offset = (C / H) * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; + + for (size_t h = 0; h < H; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (size_t i = 0; i < C / H; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_i_offset = h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float r_val = r[t_h_i_offset]; + float time_faaaa_val = time_faaaa[h_i_offset]; + // RWKV v6: different time_decay for each token. + float time_decay_val = time_decay[t_h_i_offset]; + + for (size_t j = 0; j < C / H; j ++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + float temp_val = kv_val * time_faaaa_val + prev_state_val; + dst_data[t_h_j_offset] += temp_val * r_val; + state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + } + } + } + } +} + +static void ggml_compute_forward_rwkv_wkv( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rwkv_wkv_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_map_unary + +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_unary_f32(params, dst, fun); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_map_binary + +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + if (params->ith != 0) { + return; + } + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(src1)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +static void ggml_compute_forward_map_binary( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_binary_f32(params, dst, fun); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + + if (params->ith != 0) { + return; + } + + fun(dst, a); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + + if (params->ith != 0) { + return; + } + + fun(dst, a, b); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[1]; + + if (params->ith != 0) { + return; + } + + fun(dst, a, b, c); +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + + struct ggml_map_custom1_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); + + p.fun(dst, a, params->ith, params->nth, p.userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + + struct ggml_map_custom2_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); + + p.fun(dst, a, b, params->ith, params->nth, p.userdata); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * a = dst->src[0]; + const struct ggml_tensor * b = dst->src[1]; + const struct ggml_tensor * c = dst->src[2]; + + struct ggml_map_custom3_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); + + p.fun(dst, a, b, c, params->ith, params->nth, p.userdata); +} + +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + float * st = ((float *) params->wdata) + nth + ith*nc; + float sum_thread = 0.0f; + + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + for (int64_t i1 = ir0; i1 < ir1; ++i1) { + const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); + const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); + +#ifndef NDEBUG + for (int64_t i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max); + assert(sum_softmax >= 0.0); + + ggml_vec_add1_f32(nc, st, st, -sum_softmax); + ggml_vec_mul_f32(nc, st, st, s1); + + float sum_st = 0.0f; + ggml_vec_sum_f32(nc, &sum_st, st); + sum_thread += sum_st; + +#ifndef NDEBUG + for (int64_t i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + sums[ith] = sum_thread; + ggml_barrier(params->threadpool); + + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; + } +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * opt0 = dst->src[2]; + + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + +#ifndef NDEBUG + for (int64_t i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + + // soft_max + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); + assert(sum > 0.0); + ggml_vec_scale_f32(nc, ds0, 1.0/sum); + + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d_by_nr); + +#ifndef NDEBUG + for (int64_t i = 0; i < nc; ++i) { + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +static void ggml_compute_forward_opt_step_adamw_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src0_grad = dst->src[1]; + const struct ggml_tensor * src0_grad_m = dst->src[2]; + const struct ggml_tensor * src0_grad_v = dst->src[3]; + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + /* const float gnorm = 1.0f; */ + int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t)); + const float alpha = ggml_get_op_params_f32(dst, 2); + const float beta1 = ggml_get_op_params_f32(dst, 3); + const float beta2 = ggml_get_op_params_f32(dst, 4); + const float eps = ggml_get_op_params_f32(dst, 5); + const float wd = ggml_get_op_params_f32(dst, 6); + + const float beta1h = alpha/(1.0f - powf(beta1, iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, iter)); + + for (int ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const size_t offset = i03*nb03 + i02*nb02 + i01*nb01; + + float * w = (float *) ((char *) src0->data + offset); // weight + const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad + float * m = (float *) ((char *) src0_grad_m->data + offset); + float * v = (float *) ((char *) src0_grad_v->data + offset); + + for (int i00 = 0; i00 < ne00; ++i00) { + m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); + v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); + + const float mh = m[i00]*beta1h; + const float vh = sqrtf(v[i00]*beta2h) + eps; + + // The weight decay is applied independently of the Adam momenta m and v. + // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss. + // See: https://arxiv.org/pdf/1711.05101v3.pdf + w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh; + } + } + + ggml_barrier(params->threadpool); + if (ith != 0) { + return; + } + + iter++; + memcpy(&dst->op_params[0], &iter, sizeof(int64_t)); +} + +static void ggml_compute_forward_opt_step_adamw( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_opt_step_adamw_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} +///////////////////////////////// + +static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + GGML_ASSERT(params); + + if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { + return; + } + + switch (tensor->op) { + case GGML_OP_DUP: + { + ggml_compute_forward_dup(params, tensor); + } break; + case GGML_OP_ADD: + { + ggml_compute_forward_add(params, tensor); + } break; + case GGML_OP_ADD1: + { + ggml_compute_forward_add1(params, tensor); + } break; + case GGML_OP_ACC: + { + ggml_compute_forward_acc(params, tensor); + } break; + case GGML_OP_SUB: + { + ggml_compute_forward_sub(params, tensor); + } break; + case GGML_OP_MUL: + { + ggml_compute_forward_mul(params, tensor); + } break; + case GGML_OP_DIV: + { + ggml_compute_forward_div(params, tensor); + } break; + case GGML_OP_SQR: + { + ggml_compute_forward_sqr(params, tensor); + } break; + case GGML_OP_SQRT: + { + ggml_compute_forward_sqrt(params, tensor); + } break; + case GGML_OP_LOG: + { + ggml_compute_forward_log(params, tensor); + } break; + case GGML_OP_SIN: + { + ggml_compute_forward_sin(params, tensor); + } break; + case GGML_OP_COS: + { + ggml_compute_forward_cos(params, tensor); + } break; + case GGML_OP_SUM: + { + ggml_compute_forward_sum(params, tensor); + } break; + case GGML_OP_SUM_ROWS: + { + ggml_compute_forward_sum_rows(params, tensor); + } break; + case GGML_OP_MEAN: + { + ggml_compute_forward_mean(params, tensor); + } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor); + } break; + case GGML_OP_COUNT_EQUAL: + { + ggml_compute_forward_count_equal(params, tensor); + } break; + case GGML_OP_REPEAT: + { + ggml_compute_forward_repeat(params, tensor); + } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor); + } break; + case GGML_OP_CONCAT: + { + ggml_compute_forward_concat(params, tensor); + } break; + case GGML_OP_SILU_BACK: + { + ggml_compute_forward_silu_back(params, tensor); + } break; + case GGML_OP_NORM: + { + ggml_compute_forward_norm(params, tensor); + } break; + case GGML_OP_RMS_NORM: + { + ggml_compute_forward_rms_norm(params, tensor); + } break; + case GGML_OP_RMS_NORM_BACK: + { + ggml_compute_forward_rms_norm_back(params, tensor); + } break; + case GGML_OP_GROUP_NORM: + { + ggml_compute_forward_group_norm(params, tensor); + } break; + case GGML_OP_MUL_MAT: + { + ggml_compute_forward_mul_mat(params, tensor); + } break; + case GGML_OP_MUL_MAT_ID: + { + ggml_compute_forward_mul_mat_id(params, tensor); + } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor); + } break; + case GGML_OP_SCALE: + { + ggml_compute_forward_scale(params, tensor); + } break; + case GGML_OP_SET: + { + ggml_compute_forward_set(params, tensor); + } break; + case GGML_OP_CPY: + { + ggml_compute_forward_cpy(params, tensor); + } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor); + } break; + case GGML_OP_RESHAPE: + { + ggml_compute_forward_reshape(params, tensor); + } break; + case GGML_OP_VIEW: + { + ggml_compute_forward_view(params, tensor); + } break; + case GGML_OP_PERMUTE: + { + ggml_compute_forward_permute(params, tensor); + } break; + case GGML_OP_TRANSPOSE: + { + ggml_compute_forward_transpose(params, tensor); + } break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(params, tensor); + } break; + case GGML_OP_GET_ROWS_BACK: + { + ggml_compute_forward_get_rows_back(params, tensor); + } break; + case GGML_OP_DIAG: + { + ggml_compute_forward_diag(params, tensor); + } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_compute_forward_diag_mask_inf(params, tensor); + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + ggml_compute_forward_diag_mask_zero(params, tensor); + } break; + case GGML_OP_SOFT_MAX: + { + ggml_compute_forward_soft_max(params, tensor); + } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor); + } break; + case GGML_OP_ROPE: + { + ggml_compute_forward_rope(params, tensor); + } break; + case GGML_OP_ROPE_BACK: + { + ggml_compute_forward_rope_back(params, tensor); + } break; + case GGML_OP_CLAMP: + { + ggml_compute_forward_clamp(params, tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_compute_forward_conv_transpose_1d(params, tensor); + } break; + case GGML_OP_IM2COL: + { + ggml_compute_forward_im2col(params, tensor); + } break; + case GGML_OP_IM2COL_BACK: + { + ggml_compute_forward_im2col_back_f32(params, tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + ggml_compute_forward_conv_transpose_2d(params, tensor); + } break; + case GGML_OP_POOL_1D: + { + ggml_compute_forward_pool_1d(params, tensor); + } break; + case GGML_OP_POOL_2D: + { + ggml_compute_forward_pool_2d(params, tensor); + } break; + case GGML_OP_POOL_2D_BACK: + { + ggml_compute_forward_pool_2d_back(params, tensor); + } break; + case GGML_OP_UPSCALE: + { + ggml_compute_forward_upscale(params, tensor); + } break; + case GGML_OP_PAD: + { + ggml_compute_forward_pad(params, tensor); + } break; + case GGML_OP_ARANGE: + { + ggml_compute_forward_arange(params, tensor); + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + ggml_compute_forward_timestep_embedding(params, tensor); + } break; + case GGML_OP_ARGSORT: + { + ggml_compute_forward_argsort(params, tensor); + } break; + case GGML_OP_LEAKY_RELU: + { + ggml_compute_forward_leaky_relu(params, tensor); + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, masked, tensor); + } break; + case GGML_OP_SSM_CONV: + { + ggml_compute_forward_ssm_conv(params, tensor); + } break; + case GGML_OP_SSM_SCAN: + { + ggml_compute_forward_ssm_scan(params, tensor); + } break; + case GGML_OP_WIN_PART: + { + ggml_compute_forward_win_part(params, tensor); + } break; + case GGML_OP_WIN_UNPART: + { + ggml_compute_forward_win_unpart(params, tensor); + } break; + case GGML_OP_UNARY: + { + ggml_compute_forward_unary(params, tensor); + } break; + case GGML_OP_GET_REL_POS: + { + ggml_compute_forward_get_rel_pos(params, tensor); + } break; + case GGML_OP_ADD_REL_POS: + { + ggml_compute_forward_add_rel_pos(params, tensor); + } break; + case GGML_OP_RWKV_WKV: + { + ggml_compute_forward_rwkv_wkv(params, tensor); + } break; + case GGML_OP_MAP_UNARY: + { + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_unary(params, tensor, fun); + } + break; + case GGML_OP_MAP_BINARY: + { + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_binary(params, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1_F32: + { + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom1_f32(params, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: + { + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3_F32: + { + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3_f32(params, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + ggml_compute_forward_map_custom2(params, tensor); + } + break; + case GGML_OP_MAP_CUSTOM3: + { + ggml_compute_forward_map_custom3(params, tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor); + } + break; + case GGML_OP_OPT_STEP_ADAMW: + { + ggml_compute_forward_opt_step_adamw(params, tensor); + } + break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ABORT("fatal error"); + } + } +} + +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__gnu_linux__) +static void set_numa_thread_affinity(int thread_n) { + if (!ggml_is_numa()) { + return; + } + + int node_num; + int rv; + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + switch(g_state.numa.numa_strategy) { + case GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) + node_num = thread_n % g_state.numa.n_nodes; + break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); + } + return; + default: + return; + } + + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i = 0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + + rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); + } + + CPU_FREE(cpus); +} + +static void clear_numa_thread_affinity(void) { + if (!ggml_is_numa()) { + return; + } + + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); + } + + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. +// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } +static void clear_numa_thread_affinity(void) {} +#endif + +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { + int n_tasks = 0; + + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_CONT: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: + { + n_tasks = n_threads; + } break; + case GGML_OP_SUB: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT_EQUAL: + { + n_tasks = n_threads; + } break; + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_LEAKY_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_EXP: + { + n_tasks = 1; + } break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + default: + GGML_ABORT("fatal error"); + } + break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + case GGML_OP_CONCAT: + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + } break; + case GGML_OP_GET_ROWS: + { + // FIXME: get_rows can use additional threads, but the cost of launching additional threads + // decreases performance with GPU offloading + //n_tasks = n_threads; + n_tasks = 1; + } break; + case GGML_OP_SCALE: + case GGML_OP_SET: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_SOFT_MAX: + { + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); + } break; + case GGML_OP_IM2COL: + case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + case GGML_OP_POOL_2D_BACK: + { + n_tasks = 1; + } break; + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_ARGSORT: + case GGML_OP_FLASH_ATTN_EXT: + case GGML_OP_FLASH_ATTN_BACK: + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: + { + n_tasks = n_threads; + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_RWKV_WKV: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p.n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p.n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p.n_tasks, n_threads); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_OPT_STEP_ADAMW: + { + n_tasks = n_threads; + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ABORT("fatal error"); + } + default: + { + fprintf(stderr, "%s: op not implemented: ", __func__); + if (node->op < GGML_OP_COUNT) { + fprintf(stderr, "%s\n", ggml_op_name(node->op)); + } else { + fprintf(stderr, "%d\n", node->op); + } + GGML_ABORT("fatal error"); + } + } + + assert(n_tasks > 0); + + return n_tasks; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +bool ggml_thread_apply_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_MAX_N_THREADS >= 64); + + for (int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)bitmask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + // Note that on Windows the Process Priority Class must be updated in order to set Thread priority. + // This is up to the applications. + DWORD p = THREAD_PRIORITY_NORMAL; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + if (!SetThreadPriority(GetCurrentThread(), p)) { + fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + + return true; +} + +#elif defined(__APPLE__) +#include +#include + +static bool ggml_thread_apply_affinity(const bool * mask) { + // Not supported on Apple platforms + UNUSED(mask); + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); + return false; + } + + return true; +} + +#elif defined(__gnu_linux__) +// TODO: this may not work on BSD, to be verified + +static bool ggml_thread_apply_affinity(const bool * mask) { + cpu_set_t cpuset; + int err; + + CPU_ZERO(&cpuset); + + for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + CPU_SET(i, &cpuset); + } + } + +#ifdef __ANDROID__ + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + err = errno; + } +#else + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); +#endif + if (err != 0) { + fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err); + return false; + } + + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + if (prio == GGML_SCHED_PRIO_NORMAL) { + // Keep inherited policy/priority + return true; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); + return false; + } + + return true; +} + +#else // unsupported platforms + +static bool ggml_thread_apply_affinity(const bool * mask) { + UNUSED(mask); + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + UNUSED(prio); + return true; +} + +#endif + +static bool ggml_thread_cpumask_is_valid(const bool * mask) { + for (int i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { return true; } + } + return false; +} + +static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { + if (!strict) { + memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); + return; + } else { + memset(local_mask, 0, GGML_MAX_N_THREADS); + int32_t base_idx = *iter; + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + int32_t idx = base_idx + i; + if (idx >= GGML_MAX_N_THREADS) { + // Just a cheaper modulo + idx -= GGML_MAX_N_THREADS; + } + if (global_mask[idx]) { + local_mask[idx] = 1; + *iter = idx + 1; + return; + } + } + } +} + +void ggml_threadpool_free(struct ggml_threadpool* threadpool) { + if (!threadpool) return; + + const int n_threads = threadpool->n_threads_max; + +#ifndef GGML_USE_OPENMP + struct ggml_compute_state* workers = threadpool->workers; + + ggml_mutex_lock(&threadpool->mutex); + + threadpool->stop = true; + threadpool->pause = false; + + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + + for (int j = 1; j < n_threads; j++) { + int32_t rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); + UNUSED(rc); + } + + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); +#endif // GGML_USE_OPENMP + + const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads; + ggml_aligned_free(threadpool->workers, workers_size); + ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool)); +} + +#ifndef GGML_USE_OPENMP +// pause/resume must be called under mutex +static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) { + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); +} + +static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) { + GGML_PRINT_DEBUG("Resuming threadpool\n"); + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); +} +#endif + +void ggml_threadpool_pause(struct ggml_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + ggml_mutex_lock(&threadpool->mutex); + if (!threadpool->pause) { + ggml_threadpool_pause_locked(threadpool); + } + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + ggml_mutex_lock(&threadpool->mutex); + if (threadpool->pause) { + ggml_threadpool_resume_locked(threadpool); + } + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_threadpool * threadpool) { + + if (threadpool == NULL) { + //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); + } + if (n_threads <= 0) { + n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + int max_tasks = 1; + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + const int n_tasks = ggml_get_n_tasks(node, n_threads); + + max_tasks = MAX(max_tasks, n_tasks); + + size_t cur = 0; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + if (ggml_is_quantized(node->type) || + // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 + (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || + (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_ACC: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + } break; + case GGML_OP_COUNT_EQUAL: + { + cur = ggml_type_size(node->type)*n_tasks; + } break; + case GGML_OP_MUL_MAT: + { + const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; + + if (node->src[1]->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + } + } break; + case GGML_OP_MUL_MAT_ID: + { + cur = 0; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); + } + const int n_as = src0->ne[2]; + cur += GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows + } break; + case GGML_OP_OUT_PROD: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + if ((node->src[0]->type == GGML_TYPE_F16 || + node->src[0]->type == GGML_TYPE_BF16) && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; + } else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In + + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + const int64_t ne00 = node->src[0]->ne[0]; // D + + cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + } break; + + case GGML_OP_CROSS_ENTROPY_LOSS: + { + cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + } break; + case GGML_OP_COUNT: + { + GGML_ABORT("fatal error"); + } + default: + break; + } + + work_size = MAX(work_size, cur); + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads); + } + + cplan.threadpool = threadpool; + cplan.n_threads = MIN(max_tasks, n_threads); + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * tp = state->threadpool; + + const struct ggml_cgraph * cgraph = tp->cgraph; + const struct ggml_cplan * cplan = tp->cplan; + + set_numa_thread_affinity(state->ith); + + struct ggml_compute_params params = { + /*.ith =*/ state->ith, + /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ tp, + }; + + for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; + + ggml_compute_forward(¶ms, node); + + if (state->ith == 0 && cplan->abort_callback && + cplan->abort_callback(cplan->abort_callback_data)) { + tp->abort = true; + tp->ec = GGML_STATUS_ABORTED; + } + + ggml_barrier(state->threadpool); + } + + return 0; +} + +#ifndef GGML_USE_OPENMP + +// check if thread is active +static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed); + return (state->ith < n_threads); +} + +// check if thread is ready to proceed (exit from polling or sleeping) +static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + if (state->pending || threadpool->stop || threadpool->pause) { return true; } + + // check for new graph/work + int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); + if (new_graph != state->last_graph) { + state->pending = ggml_graph_compute_thread_active(state); + state->last_graph = new_graph; + } + + return state->pending; +} + +// sync thread state after polling +static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) { + // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead + #ifdef GGML_TSAN_ENABLED + atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst); + #else + atomic_thread_fence(memory_order_seq_cst); + #endif + UNUSED(state); +} + +static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + // Skip polling for unused threads + if (!ggml_graph_compute_thread_active(state)) { + return state->pending; + } + + // This seems to make 0 ... 100 a decent range for polling level across modern processors. + // Perhaps, we can adjust it dynamically based on load and things. + const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; + + for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) { + // No new work. Keep polling. + ggml_thread_cpu_relax(); + } + + return state->pending; +} + +static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + + if (ggml_graph_compute_poll_for_work(state)) { + ggml_graph_compute_thread_sync(state); + return state->pending; + } + + ggml_mutex_lock_shared(&threadpool->mutex); + while (!ggml_graph_compute_thread_ready(state)) { + // No new work. Wait for the signal. + GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith); + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + + return state->pending; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * threadpool = state->threadpool; + + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(state->cpumask)) { + ggml_thread_apply_affinity(state->cpumask); + } + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // The main thread is the only one that can dispatch new work + + ggml_graph_compute_check_for_work(state); + if (state->pending) { + state->pending = false; + + ggml_graph_compute_thread(state); + } + } + + return (thread_ret_t) 0; +} + +// Start processing new graph +static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads) +{ + // Always take the mutex here because the worker threads are doing hybrid poll/wait + + ggml_mutex_lock(&threadpool->mutex); + + GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads); + + // Update the number of active threads + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); + + // Indicate the graph is ready to be processed + // We need the full seq-cst fence here because of the polling threads (used in thread_sync) + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst); + + if (threadpool->pause) { + // Update main thread prio and affinity to match the threadpool settings + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + + // resume does cond broadcast + ggml_threadpool_resume_locked(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); + } + + ggml_mutex_unlock(&threadpool->mutex); +} + +#endif // GGML_USE_OPENMP + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} + +static struct ggml_threadpool * ggml_threadpool_new_impl( + struct ggml_threadpool_params * tpp, + struct ggml_cgraph * cgraph, + struct ggml_cplan * cplan) { + + struct ggml_threadpool * threadpool = + ggml_aligned_malloc(sizeof(struct ggml_threadpool)); + { + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_graph = 0; + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->stop = false; + threadpool->pause = tpp->paused; + threadpool->abort = false; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = tpp->n_threads; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + threadpool->ec = GGML_STATUS_SUCCESS; + } + + // Allocate and init workers state + const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; + struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size); + + memset(workers, 0, workers_size); + for (int j = 0; j < tpp->n_threads; j++) { + workers[j].threadpool = threadpool; + workers[j].ith = j; + } + + threadpool->workers = workers; + +#ifndef GGML_USE_OPENMP + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + + // Spin the threads for all workers, and update CPU placements. + // Place the main thread last (towards the higher numbered CPU cores). + + int32_t cpumask_iter = 0; + + for (int j = 1; j < tpp->n_threads; j++) { + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + + int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]); + GGML_ASSERT(rc == 0); + } + + ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter); + + if (!threadpool->pause) { + // Update main thread prio and affinity at the start, otherwise we'll do it in resume + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[0].cpumask); + } + } +#endif // GGML_USE_OPENMP + + return threadpool; +} + +struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { + return ggml_threadpool_new_impl(tpp, NULL, NULL); +} + +enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + ggml_cpu_init(); + + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); + + int n_threads = cplan->n_threads; + struct ggml_threadpool * threadpool = cplan->threadpool; + + bool disposable_threadpool = false; + + if (threadpool == NULL) { + //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); + disposable_threadpool = true; + + struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); + threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); + } else { + // Reset some of the parameters that need resetting + // No worker threads should be accessing the parameters below at this stage + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->current_chunk = 0; + threadpool->abort = false; + threadpool->ec = GGML_STATUS_SUCCESS; + } + +#ifdef GGML_USE_OPENMP + if (n_threads > 1) { + #pragma omp parallel num_threads(n_threads) + { + #pragma omp single + { + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); + } + + ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); + } + } else { + atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed); + ggml_graph_compute_thread(&threadpool->workers[0]); + } +#else + if (n_threads > threadpool->n_threads_max) { + GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); + n_threads = threadpool->n_threads_max; + } + + // Kick all threads to start the new graph + ggml_graph_compute_kickoff(threadpool, n_threads); + + // This is a work thread too + ggml_graph_compute_thread(&threadpool->workers[0]); +#endif + + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + enum ggml_status ret = threadpool->ec; + + if (disposable_threadpool) { + ggml_threadpool_free(threadpool); + } + + return ret; +} + +enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); + + cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size); + + return ggml_graph_compute(cgraph, &cplan); +} + +int ggml_cpu_has_neon(void) { +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_neon; +#else + return 0; +#endif +} + +int ggml_cpu_has_sve(void) { +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_sve; +#else + return 0; +#endif +} + +int ggml_cpu_has_matmul_int8(void) { +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.has_i8mm; +#else + return 0; +#endif +} + +int ggml_cpu_get_sve_cnt(void) { +#if defined(__ARM_ARCH) + return ggml_arm_arch_features.sve_cnt; +#else + return 0; +#endif +} + +void ggml_cpu_init(void) { + ggml_critical_section_start(); + + static bool is_first_call = true; + + if (is_first_call) { + // initialize GELU, Quick GELU, SILU and EXP F32 tables + { + // FIXME: this may be called before ggml_init + //const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + + for (int i = 0; i < (1 << 16); ++i) { + union { + uint16_t u16; + ggml_fp16_t fp16; + } u = {i}; + // FIXME: this table is used in conversion functions outside of compute + // current code depends on ggml_init initializing this table + float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + } + + //const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + + //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); + } + +#if defined(__ARM_ARCH) + ggml_init_arm_arch_features(); +#endif + + is_first_call = false; + } + + ggml_critical_section_end(); +} diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 65c4f8119..af29a26f0 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -8,6 +8,7 @@ #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include #include +#include #ifdef __cplusplus extern "C" { @@ -36,6 +37,20 @@ extern "C" { #endif #endif +static inline int ggml_up32(int n) { + return (n + 31) & ~31; +} + +//static inline int ggml_up64(int n) { +// return (n + 63) & ~63; +//} + +static inline int ggml_up(int n, int m) { + // assert m is a power of 2 + GGML_ASSERT((m & (m - 1)) == 0); + return (n + m - 1) & ~(m - 1); +} + // // logging // @@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define GGML_DEBUG 0 + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +// tensor params + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); + return ((const float *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; +} + +static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); + ((float *)(tensor->op_params))[i] = value; +} + +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + // bitset typedef uint32_t ggml_bitset_t; @@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); void * ggml_aligned_malloc(size_t size); void ggml_aligned_free(void * ptr, size_t size); +// TODO: move to threading file +void ggml_critical_section_start(void); +void ggml_critical_section_end(void); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 2778009e4..8a772f224 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); - - UNUSED(dev); - UNUSED(max_tensor_size); -} - static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { UNUSED(dev); UNUSED(op); @@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = { /* .init_backend = */ ggml_backend_rpc_device_init, /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_rpc_device_supports_op, /* .supports_buft = */ ggml_backend_rpc_device_supports_buft, /* .offload_op = */ NULL, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 84f2c766b..7dc3340a1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1,4 +1,4 @@ -#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows +#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-backend.h" @@ -31,168 +31,27 @@ #include #endif -#ifdef GGML_USE_OPENMP -#include -#endif - -#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) -#undef GGML_USE_LLAMAFILE -#endif - -#ifdef GGML_USE_LLAMAFILE -#include -#endif - -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) - -// disable POSIX deprecation warnings -// these functions are never going away, anyway -#pragma warning(disable: 4996) - -// unreachable code because of multiple instances of code after GGML_ABORT -#pragma warning(disable: 4702) -#endif - -// Note: once we move threading into a separate C++ file -// will use std::hardware_destructive_interference_size instead of hardcoding it here -// and we'll use C++ attribute syntax. -#define GGML_CACHE_LINE 64 - -#if defined(__clang__) || defined(__GNUC__) -#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE))) -#endif - -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define GGML_TSAN_ENABLED 1 -#endif -#else // __has_feature -#if defined(__SANITIZE_THREAD__) -#define GGML_TSAN_ENABLED 1 -#endif -#endif // __has_feature - -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include - -#if !defined(__clang__) -#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE)) - -typedef volatile LONG atomic_int; -typedef atomic_int atomic_bool; -typedef atomic_int atomic_flag; - -#define ATOMIC_FLAG_INIT 0 - -typedef enum { - memory_order_relaxed, - memory_order_consume, - memory_order_acquire, - memory_order_release, - memory_order_acq_rel, - memory_order_seq_cst -} memory_order; - -static void atomic_store(atomic_int * ptr, LONG val) { - InterlockedExchange(ptr, val); -} -static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) { - // TODO: add support for explicit memory order - InterlockedExchange(ptr, val); -} -static LONG atomic_load(atomic_int * ptr) { - return InterlockedCompareExchange(ptr, 0, 0); -} -static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) { - // TODO: add support for explicit memory order - return InterlockedCompareExchange(ptr, 0, 0); -} -static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { - return InterlockedExchangeAdd(ptr, inc); -} -static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) { - // TODO: add support for explicit memory order - return InterlockedExchangeAdd(ptr, inc); -} -static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { - return InterlockedExchange(ptr, 1); -} -static void atomic_flag_clear(atomic_flag * ptr) { - InterlockedExchange(ptr, 0); -} -static void atomic_thread_fence(memory_order mo) { - MemoryBarrier(); -} -#else // clang -#include -#endif - -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void * unused) { - (void) unused; - int ret = (int) WaitForSingleObject(thread, INFINITE); - CloseHandle(thread); - return ret; -} - -static int sched_yield (void) { - Sleep (0); - return 0; -} -#else - -#include -#include -#include -#if defined(__FreeBSD__) -#include -#endif - -typedef void * thread_ret_t; - -#include -#include -#include - -#endif - -typedef pthread_t ggml_thread_t; - -#ifdef GGML_USE_CPU_HBM -#include -#endif - #if defined(__APPLE__) #include #include #include #endif +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX + #define NOMINMAX +#endif +#include +#endif + +#define UNUSED GGML_UNUSED + #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH)) - +#include +#include +#include #include #if defined(__ANDROID__) @@ -305,15 +164,6 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { abort(); } -#define GGML_DEBUG 0 - -#define GGML_GELU_FP16 -#define GGML_GELU_QUICK_FP16 - -#define GGML_SOFT_MAX_UNROLL 4 -#define GGML_VEC_DOT_UNROLL 2 -#define GGML_VEC_MAD_UNROLL 32 - // // logging // @@ -358,24 +208,6 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi fflush(stderr); } -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - // // end of logging block // @@ -396,9 +228,9 @@ void * ggml_aligned_malloc(size_t size) { return NULL; } void * aligned_memory = NULL; -#ifdef GGML_USE_CPU_HBM + #ifdef GGML_USE_CPU_HBM int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); -#elif TARGET_OS_OSX + #elif TARGET_OS_OSX kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); int result = EFAULT; switch (alloc_status) { @@ -415,12 +247,9 @@ void * ggml_aligned_malloc(size_t size) { result = EFAULT; break; } -#elif GGML_USE_METAL - const long page_size = sysconf(_SC_PAGESIZE); - int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size); -#else + #else int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); -#endif + #endif if (result != 0) { // Handle allocation failure const char *error_desc = "unknown allocation error"; @@ -433,7 +262,6 @@ void * ggml_aligned_malloc(size_t size) { break; } GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); - GGML_ABORT("fatal error"); return NULL; } return aligned_memory; @@ -490,44 +318,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #define GGML_FREE(ptr) free(ptr) -#define UNUSED GGML_UNUSED -#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) - -#if defined(GGML_USE_ACCELERATE) -#include -#endif - -// floating point type used to accumulate sums -typedef double ggml_float; - -#undef MIN -#undef MAX - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -// -// global data -// - -// precomputed gelu table for f16 (128 KB) -static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; - -// precomputed quick gelu table for f16 (128 KB) -static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; - -// precomputed f32 table for f16 (256 KB) (ggml-impl.h) -float ggml_table_f32_f16[1 << 16]; - -#if defined(__ARM_ARCH) -struct ggml_arm_arch_features_type { - int has_neon; - int has_i8mm; - int has_sve; - int sve_cnt; -} ggml_arm_arch_features = {-1, -1, -1, 0}; -#endif - const char * ggml_status_to_string(enum ggml_status status) { switch (status) { case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; @@ -565,18 +355,22 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } +// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library +// currently, the ggml_cpu_has_* functions are entirely compile-time void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { int64_t i = 0; #if defined(__F16C__) - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); + if (ggml_cpu_has_f16c()) { + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } } #endif for (; i < n; i++) { @@ -587,24 +381,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { int64_t i = 0; #if defined(__AVX512F__) - for (; i + 16 <= n; i += 16) { - _mm512_storeu_ps(y + i, - _mm512_castsi512_ps( - _mm512_slli_epi32( - _mm512_cvtepu16_epi32( - _mm256_loadu_si256( - (const __m256i *)(x + i))), - 16))); + if (ggml_cpu_has_avx512()) { + for (; i + 16 <= n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } } -#elif defined(__AVX2__) - for (; i + 8 <= n; i += 8) { - _mm256_storeu_ps(y + i, - _mm256_castsi256_ps( - _mm256_slli_epi32( - _mm256_cvtepu16_epi32( - _mm_loadu_si128( - (const __m128i *)(x + i))), - 16))); + if (ggml_cpu_has_avx2()) { + for (; i + 8 <= n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } } #endif for (; i < n; i++) { @@ -737,24 +534,8 @@ FILE * ggml_fopen(const char * fname, const char * mode) { #else return fopen(fname, mode); #endif + } - -// -// cache line -// - -#if defined(__cpp_lib_hardware_interference_size) -#define CACHE_LINE_SIZE hardware_destructive_interference_size -#else -#if defined(__POWER9_VECTOR__) -#define CACHE_LINE_SIZE 128 -#else -#define CACHE_LINE_SIZE 64 -#endif -#endif - -static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); - static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); @@ -789,16 +570,12 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(double), .is_quantized = false, - .nrows = 1, }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, .type_size = sizeof(float), .is_quantized = false, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, - .vec_dot_type = GGML_TYPE_F32, - .nrows = 1, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -808,9 +585,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, - .vec_dot_type = GGML_TYPE_F16, - .nrows = 1, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -820,13 +594,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float = quantize_row_q4_0, .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, - .vec_dot = ggml_vec_dot_q4_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -836,13 +603,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q4_1, .from_float = quantize_row_q4_1, .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, - .vec_dot = ggml_vec_dot_q4_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -852,9 +612,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = NULL, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_COUNT, - .nrows = 1, }, [5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -864,9 +621,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = NULL, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_COUNT, - .nrows = 1, }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", @@ -876,9 +630,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q5_0, .from_float = quantize_row_q5_0, .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, - .vec_dot = ggml_vec_dot_q5_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -888,9 +639,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q5_1, .from_float = quantize_row_q5_1, .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, - .vec_dot = ggml_vec_dot_q5_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, - .nrows = 1, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -900,14 +648,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q8_0, .from_float = quantize_row_q8_0, .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, - .from_float_to_mat = quantize_mat_q8_0, - .vec_dot = ggml_vec_dot_q8_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -916,8 +656,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .from_float = quantize_row_q8_1, .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, - .vec_dot_type = GGML_TYPE_Q8_1, - .nrows = 1, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -927,9 +665,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q2_K, .from_float = quantize_row_q2_K, .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref, - .vec_dot = ggml_vec_dot_q2_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -939,9 +674,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q3_K, .from_float = quantize_row_q3_K, .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref, - .vec_dot = ggml_vec_dot_q3_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -951,9 +683,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q4_K, .from_float = quantize_row_q4_K, .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref, - .vec_dot = ggml_vec_dot_q4_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -963,9 +692,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q5_K, .from_float = quantize_row_q5_K, .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref, - .vec_dot = ggml_vec_dot_q5_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -975,9 +701,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q6_K, .from_float = quantize_row_q6_K, .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, - .vec_dot = ggml_vec_dot_q6_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -987,9 +710,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -999,9 +719,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = ggml_vec_dot_iq2_xs_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -1011,9 +728,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, .from_float = quantize_row_iq3_xxs, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref, - .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ3_S] = { .type_name = "iq3_s", @@ -1023,9 +737,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq3_s, .from_float = quantize_row_iq3_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref, - .vec_dot = ggml_vec_dot_iq3_s_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ2_S] = { .type_name = "iq2_s", @@ -1035,9 +746,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq2_s, .from_float = quantize_row_iq2_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref, - .vec_dot = ggml_vec_dot_iq2_s_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", @@ -1047,9 +755,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq1_s, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = ggml_vec_dot_iq1_s_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ1_M] = { .type_name = "iq1_m", @@ -1059,9 +764,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq1_m, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = ggml_vec_dot_iq1_m_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", @@ -1071,9 +773,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, .from_float = quantize_row_iq4_nl, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, - .vec_dot = ggml_vec_dot_iq4_nl_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", @@ -1083,9 +782,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float = quantize_row_iq4_xs, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref, - .vec_dot = ggml_vec_dot_iq4_xs_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", @@ -1102,9 +798,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, - .vec_dot_type = GGML_TYPE_BF16, - .nrows = 1, }, [GGML_TYPE_Q4_0_4_4] = { .type_name = "q4_0_4x4", @@ -1115,12 +808,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = NULL, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x4_q8_0, - .gemm = ggml_gemm_q4_0_4x4_q8_0, }, [GGML_TYPE_Q4_0_4_8] = { .type_name = "q4_0_4x8", @@ -1131,12 +818,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = NULL, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x8_q8_0, - .gemm = ggml_gemm_q4_0_4x8_q8_0, }, [GGML_TYPE_Q4_0_8_8] = { .type_name = "q4_0_8x8", @@ -1147,12 +828,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = NULL, .from_float = NULL, .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 8, - .gemv = ggml_gemv_q4_0_8x8_q8_0, - .gemm = ggml_gemm_q4_0_8x8_q8_0, }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", @@ -1162,9 +837,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq1_0, .from_float = quantize_row_tq1_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, - .vec_dot = ggml_vec_dot_tq1_0_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, [GGML_TYPE_TQ2_0] = { .type_name = "tq2_0", @@ -1174,824 +846,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq2_0, .from_float = quantize_row_tq2_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, - .vec_dot = ggml_vec_dot_tq2_0_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, }, }; -// For internal test use const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { GGML_ASSERT(type < GGML_TYPE_COUNT); return &type_traits[type]; } -// -// simd mappings -// - -// we define a common set of C macros which map to specific intrinsics based on the current architecture -// we then implement the fundamental computation operations below using only these macros -// adding support for new architectures requires to define the corresponding SIMD macros -// -// GGML_F32_STEP / GGML_F16_STEP -// number of elements to process in a single step -// -// GGML_F32_EPR / GGML_F16_EPR -// number of elements to fit in a single register -// - -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) - -#define GGML_SIMD - -// F32 NEON - -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 float32x4_t -#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) -#define GGML_F32x4_SET1(x) vdupq_n_f32(x) -#define GGML_F32x4_LOAD vld1q_f32 -#define GGML_F32x4_STORE vst1q_f32 -#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) -#define GGML_F32x4_ADD vaddq_f32 -#define GGML_F32x4_MUL vmulq_f32 -#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ - } \ - (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 NEON - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define GGML_F16_STEP 32 - #define GGML_F16_EPR 8 - - #define GGML_F16x8 float16x8_t - #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) - #define GGML_F16x8_STORE vst1q_f16 - #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define GGML_F16x8_ADD vaddq_f16 - #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ - (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } while (0) - - #define GGML_F16_VEC GGML_F16x8 - #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO - #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) - #define GGML_F16_VEC_FMA GGML_F16x8_FMA - #define GGML_F16_VEC_ADD GGML_F16x8_ADD - #define GGML_F16_VEC_MUL GGML_F16x8_MUL - #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE -#else - // if FP16 vector arithmetic is not supported, we use FP32 instead - // and take advantage of the vcvt_ functions to convert to/from FP16 - - #define GGML_F16_STEP 16 - #define GGML_F16_EPR 4 - - #define GGML_F32Cx4 float32x4_t - #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) - #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define GGML_F32Cx4_ADD vaddq_f32 - #define GGML_F32Cx4_MUL vmulq_f32 - #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - - #define GGML_F16_VEC GGML_F32Cx4 - #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO - #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) - #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA - #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD - #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL - #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE -#endif - -#elif defined(__AVX512F__) - -#define GGML_SIMD - -// F32 AVX512 - -#define GGML_F32_STEP 64 -#define GGML_F32_EPR 16 - -#define GGML_F32x16 __m512 -#define GGML_F32x16_ZERO _mm512_setzero_ps() -#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) -#define GGML_F32x16_LOAD _mm512_loadu_ps -#define GGML_F32x16_STORE _mm512_storeu_ps -// _mm512_fmadd_ps is defined in AVX512F so no guard is required -#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) -#define GGML_F32x16_ADD _mm512_add_ps -#define GGML_F32x16_MUL _mm512_mul_ps -#define GGML_F32x16_REDUCE(res, x) \ -do { \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - res = _mm512_reduce_add_ps(x[0]); \ -} while (0) - -// TODO: is this optimal ? - -#define GGML_F32_VEC GGML_F32x16 -#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD -#define GGML_F32_VEC_STORE GGML_F32x16_STORE -#define GGML_F32_VEC_FMA GGML_F32x16_FMA -#define GGML_F32_VEC_ADD GGML_F32x16_ADD -#define GGML_F32_VEC_MUL GGML_F32x16_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE - -// F16 AVX512 - -// F16 AVX - -#define GGML_F16_STEP 64 -#define GGML_F16_EPR 16 - -// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead - -#define GGML_F32Cx16 __m512 -#define GGML_F32Cx16_ZERO _mm512_setzero_ps() -#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) - -// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F -// so F16C guard isn't required -#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) -#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) - -#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) -#define GGML_F32Cx16_ADD _mm512_add_ps -#define GGML_F32Cx16_MUL _mm512_mul_ps -#define GGML_F32Cx16_REDUCE(res, x) \ -do { \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm512_add_ps(x[i], x[offset+i]); \ - } \ - res = _mm512_reduce_add_ps(x[0]); \ -} while (0) - -#define GGML_F16_VEC GGML_F32Cx16 -#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE - -#elif defined(__AVX__) - -#define GGML_SIMD - -// F32 AVX - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 8 - -#define GGML_F32x8 __m256 -#define GGML_F32x8_ZERO _mm256_setzero_ps() -#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) -#define GGML_F32x8_LOAD _mm256_loadu_ps -#define GGML_F32x8_STORE _mm256_storeu_ps -#if defined(__FMA__) - #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) -#else - #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) -#endif -#define GGML_F32x8_ADD _mm256_add_ps -#define GGML_F32x8_MUL _mm256_mul_ps -#define GGML_F32x8_REDUCE(res, x) \ -do { \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm256_add_ps(x[i], x[offset+i]); \ - } \ - const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ - _mm256_extractf128_ps(x[0], 1)); \ - const __m128 t1 = _mm_hadd_ps(t0, t0); \ - res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} while (0) -// TODO: is this optimal ? - -#define GGML_F32_VEC GGML_F32x8 -#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD -#define GGML_F32_VEC_STORE GGML_F32x8_STORE -#define GGML_F32_VEC_FMA GGML_F32x8_FMA -#define GGML_F32_VEC_ADD GGML_F32x8_ADD -#define GGML_F32_VEC_MUL GGML_F32x8_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE - -// F16 AVX - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 8 - -// F16 arithmetic is not supported by AVX, so we use F32 instead - -#define GGML_F32Cx8 __m256 -#define GGML_F32Cx8_ZERO _mm256_setzero_ps() -#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) - -#if defined(__F16C__) -// the _mm256_cvt intrinsics require F16C -#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) -#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) -#else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { - float tmp[8]; - - for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - } - - return _mm256_loadu_ps(tmp); -} -static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { - float arr[8]; - - _mm256_storeu_ps(arr, y); - - for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); -} -#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) -#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) -#endif - -#define GGML_F32Cx8_FMA GGML_F32x8_FMA -#define GGML_F32Cx8_ADD _mm256_add_ps -#define GGML_F32Cx8_MUL _mm256_mul_ps -#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE - -#define GGML_F16_VEC GGML_F32Cx8 -#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE - -#elif defined(__POWER9_VECTOR__) - -#define GGML_SIMD - -// F32 POWER9 - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f -#define GGML_F32x4_SET1 vec_splats -#define GGML_F32x4_LOAD(p) vec_xl(0, p) -#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) -#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) -#define GGML_F32x4_ADD vec_add -#define GGML_F32x4_MUL vec_mul -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vec_add(x[i], x[offset+i]); \ - } \ - res = vec_extract(x[0], 0) + \ - vec_extract(x[0], 1) + \ - vec_extract(x[0], 2) + \ - vec_extract(x[0], 3); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 POWER9 -#define GGML_F16_STEP GGML_F32_STEP -#define GGML_F16_EPR GGML_F32_EPR -#define GGML_F16_VEC GGML_F32x4 -#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F16_VEC_FMA GGML_F32x4_FMA -#define GGML_F16_VEC_ADD GGML_F32x4_ADD -#define GGML_F16_VEC_MUL GGML_F32x4_MUL -#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE -// Use vec_xl, not vec_ld, in case the load address is not aligned. -#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ - vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ - vec_extract_fp32_from_shortl(vec_xl(0, p)) -#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] -#define GGML_F16_VEC_STORE(p, r, i) \ - if (i & 0x1) \ - vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ - r[i - GGML_ENDIAN_BYTE(0)]), \ - 0, p - GGML_F16_EPR) - -#elif defined(__wasm_simd128__) - -#define GGML_SIMD - -// F32 WASM - -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 v128_t -#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F32x4_LOAD wasm_v128_load -#define GGML_F32x4_STORE wasm_v128_store -#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) -#define GGML_F32x4_ADD wasm_f32x4_add -#define GGML_F32x4_MUL wasm_f32x4_mul -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 WASM - -#define GGML_F16_STEP 16 -#define GGML_F16_EPR 4 - -inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { - float tmp[4]; - - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); - - return wasm_v128_load(tmp); -} - -inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { - float tmp[4]; - - wasm_v128_store(tmp, x); - - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); -} - -#define GGML_F16x4 v128_t -#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) -#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) -#define GGML_F16x4_FMA GGML_F32x4_FMA -#define GGML_F16x4_ADD wasm_f32x4_add -#define GGML_F16x4_MUL wasm_f32x4_mul -#define GGML_F16x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define GGML_F16_VEC GGML_F16x4 -#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F16x4_FMA -#define GGML_F16_VEC_ADD GGML_F16x4_ADD -#define GGML_F16_VEC_MUL GGML_F16x4_MUL -#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE - -#elif defined(__SSE3__) - -#define GGML_SIMD - -// F32 SSE - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 __m128 -#define GGML_F32x4_ZERO _mm_setzero_ps() -#define GGML_F32x4_SET1(x) _mm_set1_ps(x) -#define GGML_F32x4_LOAD _mm_loadu_ps -#define GGML_F32x4_STORE _mm_storeu_ps -#if defined(__FMA__) - // TODO: Does this work? - #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) -#else - #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) -#endif -#define GGML_F32x4_ADD _mm_add_ps -#define GGML_F32x4_MUL _mm_mul_ps -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = _mm_add_ps(x[i], x[offset+i]); \ - } \ - const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ - res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ -} -// TODO: is this optimal ? - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 SSE - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 4 - -static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { - float tmp[4]; - - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); - - return _mm_loadu_ps(tmp); -} - -static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { - float arr[4]; - - _mm_storeu_ps(arr, y); - - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); -} - -#define GGML_F32Cx4 __m128 -#define GGML_F32Cx4_ZERO _mm_setzero_ps() -#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) -#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) -#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) -#define GGML_F32Cx4_FMA GGML_F32x4_FMA -#define GGML_F32Cx4_ADD _mm_add_ps -#define GGML_F32Cx4_MUL _mm_mul_ps -#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - -#define GGML_F16_VEC GGML_F32Cx4 -#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE - -#elif defined(__loongarch_asx) - -#define GGML_SIMD - -// F32 LASX -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 8 - -#define GGML_F32x8 __m256 -#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) -#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) -#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) -#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) -#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) -#define GGML_F32x8_ADD __lasx_xvfadd_s -#define GGML_F32x8_MUL __lasx_xvfmul_s -#define GGML_F32x8_REDUCE(res, x) \ -do { \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ - } \ - float *tmp_p = (float *)&x[0]; \ - res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ -} while (0) -// TODO: is this optimal ? - -#define GGML_F32_VEC GGML_F32x8 -#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD -#define GGML_F32_VEC_STORE GGML_F32x8_STORE -#define GGML_F32_VEC_FMA GGML_F32x8_FMA -#define GGML_F32_VEC_ADD GGML_F32x8_ADD -#define GGML_F32_VEC_MUL GGML_F32x8_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE - -// F16 LASX - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 8 - -// F16 arithmetic is not supported by AVX, so we use F32 instead - -#define GGML_F32Cx8 __m256 -#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) -#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) - -static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { - float tmp[8]; - - for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - } - - return (__m256)__lasx_xvld(tmp, 0); -} -static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { - float arr[8]; - - __lasx_xvst(y, arr, 0); - - for (int i = 0; i < 8; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); - } -} -#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) -#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) - -#define GGML_F32Cx8_FMA GGML_F32x8_FMA -#define GGML_F32Cx8_ADD __lasx_xvfadd_s -#define GGML_F32Cx8_MUL __lasx_xvfmul_s -#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE - -#define GGML_F16_VEC GGML_F32Cx8 -#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE - -#elif defined(__loongarch_sx) - -#define GGML_SIMD - -// F32 LSX - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 __m128 -#define GGML_F32x4_ZERO __lsx_vldi(0) -#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) -#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) -#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) -#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) -#define GGML_F32x4_ADD __lsx_vfadd_s -#define GGML_F32x4_MUL __lsx_vfmul_s -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ - tmp = __lsx_vsrli_d((__m128i)t0, 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 LSX - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 4 - -static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { - float tmp[4]; - - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); - - return __lsx_vld(tmp, 0); -} - -static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { - float arr[4]; - - __lsx_vst(y, arr, 0); - - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); -} - -#define GGML_F32Cx4 __m128 -#define GGML_F32Cx4_ZERO __lsx_vldi(0) -#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) -#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x) -#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) -#define GGML_F32Cx4_FMA GGML_F32x4_FMA -#define GGML_F32Cx4_ADD __lsx_vfadd_s -#define GGML_F32Cx4_MUL __lsx_vfmul_s -#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - -#define GGML_F16_VEC GGML_F32Cx4 -#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE - -#endif - -// GGML_F32_ARR / GGML_F16_ARR -// number of registers to use per step -#ifdef GGML_SIMD -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) -#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) -#endif - // // ggml object // @@ -2031,972 +893,6 @@ struct ggml_context_container { struct ggml_context context; }; -// -// Threading defs -// - -typedef pthread_t ggml_thread_t; - -#if defined(_WIN32) - -typedef CONDITION_VARIABLE ggml_cond_t; -typedef SRWLOCK ggml_mutex_t; - -#define ggml_mutex_init(m) InitializeSRWLock(m) -#define ggml_mutex_destroy(m) -#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) -#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) -#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) -#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) - -#define ggml_cond_init(c) InitializeConditionVariable(c) -#define ggml_cond_destroy(c) -#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) -#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -typedef pthread_cond_t ggml_cond_t; -typedef pthread_mutex_t ggml_mutex_t; - -#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) -#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) -#define ggml_mutex_lock(m) pthread_mutex_lock(m) -#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) -#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) -#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 -#define ggml_cond_init(c) pthread_cond_init(c, NULL) -#define ggml_cond_destroy(c) pthread_cond_destroy(c) -#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) -#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - -// Threadpool def -struct ggml_threadpool { - ggml_mutex_t mutex; // mutex for cond.var - ggml_cond_t cond; // cond.var for waiting for new work - - struct ggml_cgraph * cgraph; - struct ggml_cplan * cplan; - - // synchronization primitives - atomic_int n_graph; // incremented when there is work to be done (i.e each graph) - atomic_int GGML_CACHE_ALIGN n_barrier; - atomic_int GGML_CACHE_ALIGN n_barrier_passed; - atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. - - // these are atomic as an annotation for thread-sanitizer - atomic_bool stop; // Used for stopping the threadpool altogether - atomic_bool pause; // Used for pausing the threadpool or individual threads - atomic_bool abort; // Used for aborting processing of a graph - - struct ggml_compute_state * workers; // per thread state - int n_threads_max; // number of threads in the pool - atomic_int n_threads_cur; // number of threads used in the current graph - - int32_t prio; // Scheduling priority - uint32_t poll; // Polling level (0 - no polling) - - enum ggml_status ec; -}; - -// Per-thread state -struct ggml_compute_state { -#ifndef GGML_USE_OPENMP - ggml_thread_t thrd; - bool cpumask[GGML_MAX_N_THREADS]; - int last_graph; - bool pending; -#endif - struct ggml_threadpool * threadpool; - int ith; -}; - -struct ggml_compute_params { - // ith = thread index, nth = number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; - - struct ggml_threadpool * threadpool; -}; - -// -// fundamental operations -// - -inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } -inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } -inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } -inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } -inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - -#if defined(GGML_SIMD) - float sumf = 0.0f; - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - - sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - GGML_F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i]*y[i]; - } -#else - // scalar - ggml_float sumf = 0.0; - for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(x[i]*y[i]); - } -#endif - - *s = sumf; -} - -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - int i = 0; - ggml_float sumf = 0; - -#if defined(__AVX512BF16__) - __m512 c1 = _mm512_setzero_ps(); - __m512 c2 = _mm512_setzero_ps(); - for (; i + 64 <= n; i += 64) { - c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), - m512bh(_mm512_loadu_si512((y + i)))); - c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), - m512bh(_mm512_loadu_si512((y + i + 32)))); - } - sumf += (ggml_float)_mm512_reduce_add_ps(c1); - sumf += (ggml_float)_mm512_reduce_add_ps(c2); - -#elif defined(__AVX512F__) -#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) - __m512 c1 = _mm512_setzero_ps(); - __m512 c2 = _mm512_setzero_ps(); - for (; i + 32 <= n; i += 32) { - c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); - c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); - } - sumf += (ggml_float)_mm512_reduce_add_ps(c1); - sumf += (ggml_float)_mm512_reduce_add_ps(c2); - -#undef LOAD -#elif defined(__AVX2__) -#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - __m256 c4 = _mm256_setzero_ps(); - for (; i + 32 <= n; i += 32) { - c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); - c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); - c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); - c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); - } - __m128 g; - c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), - _mm256_add_ps(c2, c4)); - g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), - _mm256_castps256_ps128(c1)); - g = _mm_add_ps(g, _mm_movehl_ps(g, g)); - g = _mm_add_ss(g, _mm_movehdup_ps(g)); - sumf += (ggml_float)_mm_cvtss_f32(g); - -#undef LOAD -#endif - - for (; i < n; ++i) { - sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * - GGML_BF16_TO_FP32(y[i])); - } - *s = sumf; -} - -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - ggml_float sumf = 0.0; - -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - - sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - GGML_F16_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); - } -#else - for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); - } -#endif - - *s = sumf; -} - -// compute GGML_VEC_DOT_UNROLL dot products at once -// xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { - ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; - - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { - x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); - } - -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); - - sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); - } - } - } - - // reduce sum0..sum3 to sum0 - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - GGML_F16_VEC_REDUCE(sumf[k], sum[k]); - } - - // leftovers - for (int i = np; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); - } - } -#else - for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); - } - } -#endif - - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { - s[i] = sumf[i]; - } -} - -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] += x[i]*v; - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] += x[i]*v; - } -#endif -} - -inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#endif -} - -// xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { - - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; - - for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { - x[i] = (const float *) ((const char *) xv + i*xs); - v[i] = (const float *) ((const char *) vv + i*vs); - } - -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; - - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - vx[k] = GGML_F32_VEC_SET1(v[k][0]); - } - - GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); - } - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); - } - } - - // leftovers - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - for (int i = np; i < n; ++i) { - y[i] += x[k][i]*v[k][0]; - } - } -#else - // scalar - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - for (int i = 0; i < n; ++i) { - y[i] += x[k][i]*v[k][0]; - } - } -#endif -} - -//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } -inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { -#if defined(GGML_USE_ACCELERATE) - vDSP_vsmul(y, 1, &v, y, 1, n); -#elif defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_MUL(ay[j], vx); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] *= v; - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] *= v; - } -#endif -} - -inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_MUL(ay[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); - } -#endif -} - -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } -inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } -inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } -inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } -inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } -inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } -inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } -inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } -inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } -inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } -inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } -inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } -inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } -// TODO: optimize performance -inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } -inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } -inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } - -static const float GELU_COEF_A = 0.044715f; -static const float GELU_QUICK_COEF = -1.702f; -static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - -inline static float ggml_gelu_f32(float x) { - return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); -} - -inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { - const uint16_t * i16 = (const uint16_t *) x; - for (int i = 0; i < n; ++i) { - y[i] = ggml_table_gelu_f16[i16[i]]; - } -} - -#ifdef GGML_GELU_FP16 -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - if (x[i] <= -10.0f) { - y[i] = 0.0f; - } else if (x[i] >= 10.0f) { - y[i] = x[i]; - } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); - } - } -} -#else -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { - y[i] = ggml_gelu_f32(x[i]); - } -} -#endif - -inline static float ggml_gelu_quick_f32(float x) { - return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); -} - -//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { -// const uint16_t * i16 = (const uint16_t *) x; -// for (int i = 0; i < n; ++i) { -// y[i] = ggml_table_gelu_quick_f16[i16[i]]; -// } -//} - -#ifdef GGML_GELU_QUICK_FP16 -inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); - } -} -#else -inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { - y[i] = ggml_gelu_quick_f32(x[i]); - } -} -#endif - -// Sigmoid Linear Unit (SiLU) function -inline static float ggml_silu_f32(float x) { - return x/(1.0f + expf(-x)); -} - -#if __FINITE_MATH_ONLY__ -#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" -#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461" -#endif - -#if defined(__ARM_NEON) && defined(__aarch64__) - -// adapted from arm limited optimized routine -// the maximum error is 1.45358 plus 0.5 ulps -// numbers above 88.38 will flush to infinity -// numbers beneath -103.97 will flush to zero -inline static float32x4_t ggml_v_expf(float32x4_t x) { - const float32x4_t r = vdupq_n_f32(0x1.8p23f); - const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); - const float32x4_t n = vsubq_f32(z, r); - const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, - vdupq_n_f32(0x1.7f7d1cp-20f)); - const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); - const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); - const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); - const float32x4_t u = vmulq_f32(b, b); - const float32x4_t j = vfmaq_f32( - vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), - vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), - vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); - if (!vpaddd_u64(vreinterpretq_u64_u32(c))) - return vfmaq_f32(k, j, k); - const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); - const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); - const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); - return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), - vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); -} - -// computes silu x/(1+exp(-x)) in single precision vector -inline static float32x4_t ggml_v_silu(float32x4_t x) { - const float32x4_t one = vdupq_n_f32(1.0f); - const float32x4_t zero = vdupq_n_f32(0.0f); - const float32x4_t neg_x = vsubq_f32(zero, x); - const float32x4_t exp_neg_x = ggml_v_expf(neg_x); - const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); - return vdivq_f32(x, one_plus_exp_neg_x); -} - -#elif defined(__AVX512F__) && defined(__AVX512DQ__) - -// adapted from arm limited optimized routine -// the maximum error is 1.45358 plus 0.5 ulps -// numbers above 88.38 will flush to infinity -// numbers beneath -103.97 will flush to zero -inline static __m512 ggml_v_expf(__m512 x) { - const __m512 r = _mm512_set1_ps(0x1.8p23f); - const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); - const __m512 n = _mm512_sub_ps(z, r); - const __m512 b = - _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), - _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); - const __mmask16 d = - _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); - const __m512 u = _mm512_mul_ps(b, b); - const __m512 j = _mm512_fmadd_ps( - _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, - _mm512_set1_ps(0x1.573e2ep-5f)), - u, - _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, - _mm512_set1_ps(0x1.fffdb6p-2f))), - u, - _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F))); - const __m512 res = _mm512_scalef_ps(j, n); - if (_mm512_kortestz(d, d)) - return res; - const __m512 zero = _mm512_setzero_ps(); - const __m512 alt = _mm512_mask_blend_ps( - _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero); - return _mm512_mask_blend_ps(d, res, alt); -} - -// computes silu x/(1+exp(-x)) in single precision vector -inline static __m512 ggml_v_silu(__m512 x) { - const __m512 one = _mm512_set1_ps(1); - const __m512 zero = _mm512_setzero_ps(); - const __m512 neg_x = _mm512_sub_ps(zero, x); - const __m512 exp_neg_x = ggml_v_expf(neg_x); - const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); - return _mm512_div_ps(x, one_plus_exp_neg_x); -} - -#elif defined(__AVX2__) && defined(__FMA__) - -// adapted from arm limited optimized routine -// the maximum error is 1.45358 plus 0.5 ulps -// numbers above 88.38 will flush to infinity -// numbers beneath -103.97 will flush to zero -inline static __m256 ggml_v_expf(__m256 x) { - const __m256 r = _mm256_set1_ps(0x1.8p23f); - const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); - const __m256 n = _mm256_sub_ps(z, r); - const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), - _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); - const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); - const __m256 k = _mm256_castsi256_ps( - _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); - const __m256i c = _mm256_castps_si256( - _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), - _mm256_set1_ps(126), _CMP_GT_OQ)); - const __m256 u = _mm256_mul_ps(b, b); - const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, - _mm256_set1_ps(0x1.573e2ep-5f)), u, - _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, - _mm256_set1_ps(0x1.fffdb6p-2f))), - u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); - if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) - return _mm256_fmadd_ps(j, k, k); - const __m256i g = _mm256_and_si256( - _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), - _mm256_set1_epi32(0x82000000u)); - const __m256 s1 = - _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); - const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); - const __m256i d = _mm256_castps_si256( - _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), - _mm256_set1_ps(192), _CMP_GT_OQ)); - return _mm256_or_ps( - _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), - _mm256_andnot_ps( - _mm256_castsi256_ps(d), - _mm256_or_ps( - _mm256_and_ps(_mm256_castsi256_ps(c), - _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), - _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); -} - -// computes silu x/(1+exp(-x)) in single precision vector -inline static __m256 ggml_v_silu(__m256 x) { - const __m256 one = _mm256_set1_ps(1); - const __m256 zero = _mm256_setzero_ps(); - const __m256 neg_x = _mm256_sub_ps(zero, x); - const __m256 exp_neg_x = ggml_v_expf(neg_x); - const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); - return _mm256_div_ps(x, one_plus_exp_neg_x); -} - -#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON - -#if defined(__FMA__) -#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) -#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) -#else -#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) -#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) -#endif - -// adapted from arm limited optimized routine -// the maximum error is 1.45358 plus 0.5 ulps -// numbers above 88.38 will flush to infinity -// numbers beneath -103.97 will flush to zero -inline static __m128 ggml_v_expf(__m128 x) { - const __m128 r = _mm_set1_ps(0x1.8p23f); - const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); - const __m128 n = _mm_sub_ps(z, r); - const __m128 b = - NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); - const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); - const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); - const __m128i c = - _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); - const __m128 u = _mm_mul_ps(b, b); - const __m128 j = - MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, - MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), - u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); - if (!_mm_movemask_epi8(c)) - return MADD128(j, k, k); - const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), - _mm_set1_epi32(0x82000000u)); - const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); - const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); - const __m128i d = - _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); - return _mm_or_ps( - _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), - _mm_andnot_ps(_mm_castsi128_ps(d), - _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), - _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); -} - -// computes silu x/(1+exp(-x)) in single precision vector -inline static __m128 ggml_v_silu(__m128 x) { - const __m128 one = _mm_set1_ps(1); - const __m128 zero = _mm_setzero_ps(); - const __m128 neg_x = _mm_sub_ps(zero, x); - const __m128 exp_neg_x = ggml_v_expf(neg_x); - const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); - return _mm_div_ps(x, one_plus_exp_neg_x); -} - -#endif // __ARM_NEON / __AVX2__ / __SSE2__ - -static void ggml_vec_silu_f32(const int n, float * y, const float * x) { - int i = 0; -#if defined(__AVX512F__) && defined(__AVX512DQ__) - for (; i + 15 < n; i += 16) { - _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i))); - } -#elif defined(__AVX2__) && defined(__FMA__) - for (; i + 7 < n; i += 8) { - _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i))); - } -#elif defined(__SSE2__) - for (; i + 3 < n; i += 4) { - _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); - } -#elif defined(__ARM_NEON) && defined(__aarch64__) - for (; i + 3 < n; i += 4) { - vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); - } -#endif - for (; i < n; ++i) { - y[i] = ggml_silu_f32(x[i]); - } -} - -static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { - int i = 0; - ggml_float sum = 0; -#if defined(__AVX512F__) && defined(__AVX512DQ__) - for (; i + 15 < n; i += 16) { - __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), - _mm512_set1_ps(max))); - _mm512_storeu_ps(y + i, val); - sum += (ggml_float)_mm512_reduce_add_ps(val); - } -#elif defined(__AVX2__) && defined(__FMA__) - for (; i + 7 < n; i += 8) { - __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), - _mm256_set1_ps(max))); - _mm256_storeu_ps(y + i, val); - __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), - _mm256_castps256_ps128(val)); - val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); - val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); - sum += (ggml_float)_mm_cvtss_f32(val2); - } -#elif defined(__SSE2__) - for (; i + 3 < n; i += 4) { - __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), - _mm_set1_ps(max))); - _mm_storeu_ps(y + i, val); -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) - val = _mm_add_ps(val, _mm_movehl_ps(val, val)); - val = _mm_add_ss(val, _mm_movehdup_ps(val)); -#else - __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); - val = _mm_add_ps(val, tmp); - tmp = _mm_movehl_ps(tmp, val); - val = _mm_add_ss(val, tmp); -#endif - sum += (ggml_float)_mm_cvtss_f32(val); - } -#elif defined(__ARM_NEON) && defined(__aarch64__) - for (; i + 3 < n; i += 4) { - float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), - vdupq_n_f32(max))); - vst1q_f32(y + i, val); - sum += (ggml_float)vaddvq_f32(val); - } -#endif - for (; i < n; ++i) { - float val = expf(x[i] - max); - sum += (ggml_float)val; - y[i] = val; - } - return sum; -} - -static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) { - // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i) - - int i = 0; - ggml_float sum = 0; - for (; i < n; ++i) { - float val = x[i] - max; - y[i] = val; - sum += (ggml_float)expf(val); - } - return sum = (ggml_float)logf(sum); -} - -inline static float ggml_silu_backward_f32(float x, float dy) { - const float s = 1.0f/(1.0f + expf(-x)); - return dy*s*(1.0f + x*(1.0f - s)); -} - -inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { - for (int i = 0; i < n; ++i) { - dx[i] = ggml_silu_backward_f32(x[i], dy[i]); - } -} - -inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { -#ifndef GGML_USE_ACCELERATE - ggml_float sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; - } - *s = sum; -#else - vDSP_sve(x, 1, s, n); -#endif -} - -inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { - ggml_float sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; - } - *s = sum; -} - -inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { - float sum = 0.0f; - for (int i = 0; i < n; ++i) { - sum += GGML_FP16_TO_FP32(x[i]); - } - *s = sum; -} - -inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) { - float sum = 0.0f; - for (int i = 0; i < n; ++i) { - sum += GGML_BF16_TO_FP32(x[i]); - } - *s = sum; -} - -inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { -#ifndef GGML_USE_ACCELERATE - float max = -INFINITY; - for (int i = 0; i < n; ++i) { - max = MAX(max, x[i]); - } - *s = max; -#else - vDSP_maxv(x, 1, s, n); -#endif -} - -inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { - ggml_vec_norm_f32(n, s, x); - *s = 1.f/(*s); -} - -inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { - float max = -INFINITY; - int idx = 0; - for (int i = 0; i < n; ++i) { - max = MAX(max, x[i]); - if (max == x[i]) { idx = i; } - } - *s = idx; -} - // // data types // @@ -3217,214 +1113,6 @@ static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); -// Helpers for polling loops -#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) -static inline void ggml_thread_cpu_relax(void) { - __asm__ volatile("yield" ::: "memory"); -} -#elif defined(__x86_64__) -static inline void ggml_thread_cpu_relax(void) { - _mm_pause(); -} -#else -static inline void ggml_thread_cpu_relax(void) {;} -#endif - -// -// NUMA support -// - -#define GGML_NUMA_MAX_NODES 8 -#define GGML_NUMA_MAX_CPUS 512 - -struct ggml_numa_node { - uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node - uint32_t n_cpus; -}; - -struct ggml_numa_nodes { - enum ggml_numa_strategy numa_strategy; - struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; - uint32_t n_nodes; - uint32_t total_cpus; // hardware threads on system - uint32_t current_node; // node on which main process is execting -#if defined(__gnu_linux__) - cpu_set_t cpuset; // cpuset from numactl -#else - uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype -#endif -}; - -// -// ggml state -// - -struct ggml_state { - struct ggml_numa_nodes numa; -}; - -// global state -static struct ggml_state g_state; -static atomic_flag g_state_critical = ATOMIC_FLAG_INIT; - -// critical section via spin lock -inline static void ggml_critical_section_start(void) { - while (atomic_flag_test_and_set(&g_state_critical)) { - // spin - sched_yield(); - } -} - -static void ggml_barrier(struct ggml_threadpool * tp) { - int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); - if (n_threads == 1) { - return; - } - -#ifdef GGML_USE_OPENMP - #pragma omp barrier -#else - int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed); - - // enter barrier (full seq-cst fence) - int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst); - - if (n_barrier == (n_threads - 1)) { - // last thread - atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed); - - // exit barrier (fill seq-cst fence) - atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst); - return; - } - - // wait for other threads - while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) { - ggml_thread_cpu_relax(); - } - - // exit barrier (full seq-cst fence) - // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead - #ifdef GGML_TSAN_ENABLED - atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst); - #else - atomic_thread_fence(memory_order_seq_cst); - #endif -#endif -} - -// TODO: make this somehow automatically executed -// some sort of "sentry" mechanism -inline static void ggml_critical_section_end(void) { - atomic_flag_clear(&g_state_critical); -} - -#if defined(__gnu_linux__) -static cpu_set_t ggml_get_numa_affinity(void) { - cpu_set_t cpuset; - pthread_t thread; - thread = pthread_self(); - CPU_ZERO(&cpuset); - pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); - return cpuset; -} -#else -static uint32_t ggml_get_numa_affinity(void) { - return 0; // no NUMA support -} -#endif - -void ggml_numa_init(enum ggml_numa_strategy numa_flag) { - if (g_state.numa.n_nodes > 0) { - fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); - - return; - } - -#if defined(__gnu_linux__) - struct stat st; - char path[256]; - int rv; - - // set numa scheme - g_state.numa.numa_strategy = numa_flag; - - GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); - - g_state.numa.cpuset = ggml_get_numa_affinity(); - - // enumerate nodes - while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++g_state.numa.n_nodes; - } - - // enumerate CPUs - while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++g_state.numa.total_cpus; - } - - GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); - - // figure out which node we're on - uint current_cpu; - int getcpu_ret = 0; -#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) - getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); -#else - // old glibc doesn't have a wrapper for this call. Fall back on direct syscall -# if !defined(SYS_getcpu) && defined(SYS_get_cpu) -# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name -# endif - getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); -#endif - - if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { - g_state.numa.n_nodes = 0; - return; - } - - GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); - - for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { - struct ggml_numa_node * node = &g_state.numa.nodes[n]; - GGML_PRINT_DEBUG("CPUs on node %u:", n); - node->n_cpus = 0; - for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) == 0) { - node->cpus[node->n_cpus++] = c; - GGML_PRINT_DEBUG(" %u", c); - } - } - GGML_PRINT_DEBUG("\n"); - } - - if (ggml_is_numa()) { - FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); - if (fptr != NULL) { - char buf[42]; - if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); - } - fclose(fptr); - } - } -#else - UNUSED(numa_flag); - // TODO -#endif -} - -bool ggml_is_numa(void) { - return g_state.numa.n_nodes > 1; -} //////////////////////////////////////////////////////////////////////////////// @@ -3561,22 +1249,6 @@ int ggml_n_dims(const struct ggml_tensor * tensor) { return 1; } -static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (t0->ne[0] == t1->ne[0]) && - (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable - (t1->ne[3]%t0->ne[3] == 0); -} - -static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (t0->ne[1] == t1->ne[1]) && - (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable - (t1->ne[3]%t0->ne[3] == 0); -} - enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { enum ggml_type wtype = GGML_TYPE_COUNT; @@ -3723,140 +1395,29 @@ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const str return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1); } -static inline int ggml_up32(int n) { - return (n + 31) & ~31; -} - -//static inline int ggml_up64(int n) { -// return (n + 63) & ~63; -//} - -static inline int ggml_up(int n, int m) { - // assert m is a power of 2 - GGML_ASSERT((m & (m - 1)) == 0); - return (n + m - 1) & ~(m - 1); -} - // assert that pointer is aligned to GGML_MEM_ALIGN #define GGML_ASSERT_ALIGNED(ptr) \ GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// -#if defined(__ARM_ARCH) - -#if defined(__linux__) && defined(__aarch64__) -#include -#elif defined(__APPLE__) -#include -#endif - -#if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM 0 -#endif - -static void ggml_init_arm_arch_features(void) { -#if defined(__linux__) && defined(__aarch64__) - uint32_t hwcap = getauxval(AT_HWCAP); - uint32_t hwcap2 = getauxval(AT_HWCAP2); - - ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); - ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); - ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); - -#if defined(__ARM_FEATURE_SVE) - ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); -#endif -#elif defined(__APPLE__) - int oldp = 0; - size_t size = sizeof(oldp); - if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_neon = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_i8mm = oldp; - - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#else -// Run-time CPU feature detection not implemented for this platform, fallback to compile time -#if defined(__ARM_NEON) - ggml_arm_arch_features.has_neon = 1; -#else - ggml_arm_arch_features.has_neon = 0; -#endif - -#if defined(__ARM_FEATURE_MATMUL_INT8) - ggml_arm_arch_features.has_i8mm = 1; -#else - ggml_arm_arch_features.has_i8mm = 0; -#endif - -#if defined(__ARM_FEATURE_SVE) - ggml_arm_arch_features.has_sve = 1; - ggml_arm_arch_features.sve_cnt = 16; -#else - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#endif -#endif -} -#endif - struct ggml_context * ggml_init(struct ggml_init_params params) { - // make this function thread safe + static bool is_first_call = false; + ggml_critical_section_start(); - static bool is_first_call = true; - - if (is_first_call) { + if (!is_first_call) { // initialize time system (required on Windows) ggml_time_init(); - // initialize GELU, Quick GELU, SILU and EXP F32 tables - { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); - } - - const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - - GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + for (int i = 0; i < (1 << 16); ++i) { + union { + uint16_t u16; + ggml_fp16_t fp16; + } u = {i}; + ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); } - - // initialize g_state - { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - - g_state = (struct ggml_state) { - /*.numa =*/ { - .n_nodes = 0, - .total_cpus = 0, - }, - }; - - const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - - GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); - } - -#if defined(__ARM_ARCH) - ggml_init_arm_arch_features(); -#endif - - is_first_call = false; + is_first_call = true; } ggml_critical_section_end(); @@ -4123,183 +1684,16 @@ struct ggml_tensor * ggml_new_tensor_4d( return ggml_new_tensor(ctx, type, 4, ne); } -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); +void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, nbytes); - ggml_set_i32(result, value); - - return result; -} - -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - - ggml_set_f32(result, value); - - return result; + return (uint8_t *)ctx->mem_buffer + obj->offs; } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } -static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { - GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings - assert(params_size <= GGML_MAX_OP_PARAMS); - memcpy(tensor->op_params, params, params_size); -} - -static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - return ((const int32_t *)(tensor->op_params))[i]; -} - -static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); - return ((const float *)(tensor->op_params))[i]; -} - -static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - ((int32_t *)(tensor->op_params))[i] = value; -} - -static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); - ((float *)(tensor->op_params))[i] = value; -} - -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { - if (ggml_is_empty(tensor)) { - return tensor; - } - if (tensor->buffer) { - ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); - } else { - GGML_ASSERT(tensor->data); - memset(tensor->data, 0, ggml_nbytes(tensor)); - } - return tensor; -} - -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { - const int n = ggml_nrows(tensor); - const int nc = tensor->ne[0]; - const size_t n1 = tensor->nb[1]; - - char * const data = tensor->data; - - switch (tensor->type) { - case GGML_TYPE_I8: - { - assert(tensor->nb[0] == sizeof(int8_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I16: - { - assert(tensor->nb[0] == sizeof(int16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I32: - { - assert(tensor->nb[0] == sizeof(int32_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F16: - { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); - } - } break; - case GGML_TYPE_BF16: - { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); - } - } break; - case GGML_TYPE_F32: - { - assert(tensor->nb[0] == sizeof(float)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); - } - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - - return tensor; -} - -struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { - const int n = ggml_nrows(tensor); - const int nc = tensor->ne[0]; - const size_t n1 = tensor->nb[1]; - - char * const data = tensor->data; - - switch (tensor->type) { - case GGML_TYPE_I8: - { - assert(tensor->nb[0] == sizeof(int8_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I16: - { - assert(tensor->nb[0] == sizeof(int16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I32: - { - assert(tensor->nb[0] == sizeof(int32_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F16: - { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); - } - } break; - case GGML_TYPE_BF16: - { - assert(tensor->nb[0] == sizeof(ggml_bf16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); - } - } break; - case GGML_TYPE_F32: - { - assert(tensor->nb[0] == sizeof(float)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); - } - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - - return tensor; -} - void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { const int64_t ne2 = tensor->ne[2]; const int64_t ne1 = tensor->ne[1]; @@ -4324,280 +1718,6 @@ void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * } } -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = { 0, 0, 0, 0 }; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); - } - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - return ((int8_t *)(tensor->data))[i]; - } - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - return ((int16_t *)(tensor->data))[i]; - } - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - return ((int32_t *)(tensor->data))[i]; - } - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } - case GGML_TYPE_BF16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); - return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); - } - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; - } - default: - { - GGML_ABORT("fatal error"); - } - } -} - -void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = { 0, 0, 0, 0 }; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); - return; - } - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - ((int8_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - ((int16_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - ((int32_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_BF16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); - ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - ((float *)(tensor->data))[i] = value; - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; - switch (tensor->type) { - case GGML_TYPE_I8: - return ((int8_t *) data)[0]; - case GGML_TYPE_I16: - return ((int16_t *) data)[0]; - case GGML_TYPE_I32: - return ((int32_t *) data)[0]; - case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); - case GGML_TYPE_BF16: - return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); - case GGML_TYPE_F32: - return ((float *) data)[0]; - default: - GGML_ABORT("fatal error"); - } -} - -void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; - switch (tensor->type) { - case GGML_TYPE_I8: - { - ((int8_t *)(data))[0] = value; - } break; - case GGML_TYPE_I16: - { - ((int16_t *)(data))[0] = value; - } break; - case GGML_TYPE_I32: - { - ((int32_t *)(data))[0] = value; - } break; - case GGML_TYPE_F16: - { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_BF16: - { - ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); - } break; - case GGML_TYPE_F32: - { - ((float *)(data))[0] = value; - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = { 0, 0, 0, 0 }; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); - } - switch (tensor->type) { - case GGML_TYPE_I8: - { - return ((int8_t *)(tensor->data))[i]; - } - case GGML_TYPE_I16: - { - return ((int16_t *)(tensor->data))[i]; - } - case GGML_TYPE_I32: - { - return ((int32_t *)(tensor->data))[i]; - } - case GGML_TYPE_F16: - { - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } - case GGML_TYPE_BF16: - { - return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); - } - case GGML_TYPE_F32: - { - return ((float *)(tensor->data))[i]; - } - default: - { - GGML_ABORT("fatal error"); - } - } -} - -void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = { 0, 0, 0, 0 }; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); - return; - } - switch (tensor->type) { - case GGML_TYPE_I8: - { - ((int8_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I16: - { - ((int16_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I32: - { - ((int32_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_F16: - { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_BF16: - { - ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); - } break; - case GGML_TYPE_F32: - { - ((float *)(tensor->data))[i] = value; - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; - switch (tensor->type) { - case GGML_TYPE_I8: - return ((int8_t *) data)[0]; - case GGML_TYPE_I16: - return ((int16_t *) data)[0]; - case GGML_TYPE_I32: - return ((int32_t *) data)[0]; - case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); - case GGML_TYPE_BF16: - return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); - case GGML_TYPE_F32: - return ((float *) data)[0]; - default: - GGML_ABORT("fatal error"); - } -} - -void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { - void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; - switch (tensor->type) { - case GGML_TYPE_I8: - { - ((int8_t *)(data))[0] = value; - } break; - case GGML_TYPE_I16: - { - ((int16_t *)(data))[0] = value; - } break; - case GGML_TYPE_I32: - { - ((int32_t *)(data))[0] = value; - } break; - case GGML_TYPE_F16: - { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_BF16: - { - ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); - } break; - case GGML_TYPE_F32: - { - ((float *)(data))[0] = value; - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - void * ggml_get_data(const struct ggml_tensor * tensor) { return tensor->data; } @@ -5572,6 +2692,14 @@ struct ggml_tensor * ggml_group_norm_inplace( // ggml_mul_mat +static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, @@ -5641,6 +2769,14 @@ struct ggml_tensor * ggml_mul_mat_id( // ggml_out_prod +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + struct ggml_tensor * ggml_out_prod( struct ggml_context * ctx, struct ggml_tensor * a, @@ -7613,11 +4749,6 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( } // ggml_map_custom1 -struct ggml_map_custom1_op_params { - ggml_custom1_op_t fun; - int n_tasks; - void * userdata; -}; static struct ggml_tensor * ggml_map_custom1_impl( struct ggml_context * ctx, @@ -7663,12 +4794,6 @@ struct ggml_tensor * ggml_map_custom1_inplace( // ggml_map_custom2 -struct ggml_map_custom2_op_params { - ggml_custom2_op_t fun; - int n_tasks; - void * userdata; -}; - static struct ggml_tensor * ggml_map_custom2_impl( struct ggml_context * ctx, struct ggml_tensor * a, @@ -7717,12 +4842,6 @@ struct ggml_tensor * ggml_map_custom2_inplace( // ggml_map_custom3 -struct ggml_map_custom3_op_params { - ggml_custom3_op_t fun; - int n_tasks; - void * userdata; -}; - static struct ggml_tensor * ggml_map_custom3_impl( struct ggml_context * ctx, struct ggml_tensor * a, @@ -7850,9675 +4969,6 @@ struct ggml_tensor * ggml_opt_step_adamw( //////////////////////////////////////////////////////////////////////////////// -// ggml_compute_forward_dup - -static void ggml_compute_forward_dup_same_cont( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(src0->type == dst->type); - - const size_t nb0 = ggml_type_size(src0->type); - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = MIN(ie0 + dr, ne); - - if (ie0 < ie1) { - memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb0), - (ie1 - ie0) * nb0); - } -} - -static void ggml_compute_forward_dup_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - - GGML_TENSOR_UNARY_OP_LOCALS - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { - // copy by rows - const size_t rs = ne00*nb00; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - - if (ggml_is_contiguous(dst)) { - if (nb00 == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (type_traits[dst->type].from_float) { - ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; - float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; - - size_t id = 0; - size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); - } - - quantize_row_q(src0_f32, dst_ptr + id, ne00); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } - return; - } - - // dst counters - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - if (dst->type == GGML_TYPE_F16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); - - if (++i10 == ne00) { - i10 = 0; - if (++i11 == ne01) { - i11 = 0; - if (++i12 == ne02) { - i12 = 0; - if (++i13 == ne03) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F32) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } -} - -static void ggml_compute_forward_dup_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - - GGML_TENSOR_UNARY_OP_LOCALS - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { - // copy by rows - const size_t rs = ne00*nb00; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - - if (ggml_is_contiguous(dst)) { - if (nb00 == sizeof(ggml_bf16_t)) { - if (dst->type == GGML_TYPE_BF16) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (type_traits[dst->type].from_float) { - ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; - float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; - - size_t id = 0; - size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]); - } - - quantize_row_q(src0_f32, dst_ptr + id, ne00); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_BF16) { - size_t id = 0; - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } - return; - } - - // dst counters - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - if (dst->type == GGML_TYPE_BF16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); - - if (++i10 == ne00) { - i10 = 0; - if (++i11 == ne01) { - i11 = 0; - if (++i12 == ne02) { - i12 = 0; - if (++i13 == ne03) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F32) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } -} - -static void ggml_compute_forward_dup_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - - GGML_TENSOR_UNARY_OP_LOCALS - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { - // copy by rows - const size_t rs = ne00*nb00; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - if (ggml_is_contiguous(dst)) { - // TODO: simplify - if (nb00 == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (type_traits[dst->type].from_float) { - ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; - - size_t id = 0; - size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_BF16) { - size_t id = 0; - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } - } - - return; - } - - // dst counters - - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - if (dst->type == GGML_TYPE_F32) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, sizeof(float)); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_BF16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else { - GGML_ABORT("fatal error"); // TODO: implement - } -} - -// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. -static void ggml_compute_forward_dup_bytes( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(src0->type == dst->type); - - GGML_TENSOR_UNARY_OP_LOCALS; - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - - const size_t type_size = ggml_type_size(src0->type); - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == type_size && nb0 == type_size) { - // copy by rows - const size_t rs = ne00 * type_size; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - if (ggml_is_contiguous(dst)) { - size_t id = 0; - char * dst_ptr = (char *) dst->data; - const size_t rs = ne00 * type_size; - - if (nb00 == type_size) { - // src0 is contigous on first dimension, copy by rows - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int64_t i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, type_size); - - id += type_size; - } - } - id += rs * (ne01 - ir1); - } - } - } - - return; - } - - // dst counters - - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, type_size); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } -} - -static void ggml_compute_forward_dup( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (src0->type == dst->type) { - ggml_compute_forward_dup_bytes(params, dst); - return; - } - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_dup_f16(params, dst); - } break; - case GGML_TYPE_BF16: - { - ggml_compute_forward_dup_bf16(params, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_dup_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_add - -static void ggml_compute_forward_add_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; - } - } - } -} - -static void ggml_compute_forward_add_f16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (dst->type == GGML_TYPE_F32) { - GGML_ASSERT( nb0 == sizeof(float)); - } - else { - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - } - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - if (dst->type == GGML_TYPE_F16) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } - } else { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; - } - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_bf16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (dst->type == GGML_TYPE_F32) { - GGML_ASSERT( nb0 == sizeof(float)); - } - else { - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - } - - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - if (dst->type == GGML_TYPE_BF16) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } - } else { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; - } - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_f16_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(ggml_fp16_t)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_bf16_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_BF16); - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(ggml_bf16_t)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i])); - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_q_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const enum ggml_type type = src0->type; - const enum ggml_type dtype = dst->type; - ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - // src1 and dst are same shape as src0 => same indices - const int i13 = i03; - const int i12 = i02; - const int i11 = i01; - - const int i3 = i03; - const int i2 = i02; - const int i1 = i01; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); - void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne00); - // add src1 - ggml_vec_acc_f32(ne00, wdata, src1_row); - // quantize row to dst - if (quantize_row_q != NULL) { - quantize_row_q(wdata, dst_row, ne00); - } else { - memcpy(dst_row, wdata, ne0*nb0); - } - } -} - -static void ggml_compute_forward_add( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_BF16: - { - if (src1->type == GGML_TYPE_BF16) { - ggml_compute_forward_add_bf16_bf16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_bf16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - { - ggml_compute_forward_add_q_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_add1 - -static void ggml_compute_forward_add1_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_add1_f32); - - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data), 0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); -#else - ggml_vec_add1_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - *(float *) src1->data); -#endif - } -} - -static void ggml_compute_forward_add1_f16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - // scalar to add - const float v = *(float *) src1->data; - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); - } - } -} - -static void ggml_compute_forward_add1_f16_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); - } - } -} - -static void ggml_compute_forward_add1_q_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - // scalar to add - const float v = *(float *) src1->data; - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - const enum ggml_type type = src0->type; - ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - ggml_from_float_t const quantize_row_q = type_traits[type].from_float; - - // we don't support permuted src0 - GGML_ASSERT(nb00 == ggml_type_size(type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); - void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); - - assert(ne0 % 32 == 0); - - // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne0); - // add src1 - ggml_vec_acc1_f32(ne0, wdata, v); - // quantize row to dst - quantize_row_q(wdata, dst_row, ne0); - } -} - -static void ggml_compute_forward_add1_bf16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - // scalar to add - const float v = *(float *) src1->data; - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); - } - } -} - -static void ggml_compute_forward_add1_bf16_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - // scalar to add - const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_BF16); - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); - } - } -} - -static void ggml_compute_forward_add1( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_add1_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add1_f16_f16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_f16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_BF16: - { - if (src1->type == GGML_TYPE_BF16) { - ggml_compute_forward_add1_bf16_bf16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_bf16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - { - ggml_compute_forward_add1_q_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_acc - -static void ggml_compute_forward_acc_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // view src0 and dst with these strides and data offset inbytes during acc - // nb0 is implicitly element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) dst->op_params)[0]; - size_t nb2 = ((int32_t *) dst->op_params)[1]; - size_t nb3 = ((int32_t *) dst->op_params)[2]; - size_t offset = ((int32_t *) dst->op_params)[3]; - bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - if (params->ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. - // => do it in INIT phase - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - } - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src1); - const int nc = src1->ne[0]; - - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) - - // src0 and dst as viewed during acc - const size_t nb0 = ggml_element_size(src0); - - const size_t nb00 = nb0; - const size_t nb01 = nb1; - const size_t nb02 = nb2; - const size_t nb03 = nb3; - - GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); - GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); - - GGML_ASSERT(nb10 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are viewed with shape of src1 and offset - // => same indices - const int i3 = ir/(ne12*ne11); - const int i2 = (ir - i3*ne12*ne11)/ne11; - const int i1 = (ir - i3*ne12*ne11 - i2*ne11); - -#ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); -#else - ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); -#endif - } -} - -static void ggml_compute_forward_acc( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_acc_f32(params, dst); - } break; - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sub - -static void ggml_compute_forward_sub_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; - } - } - } -} - -static void ggml_compute_forward_sub( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sub_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_mul - -static void ggml_compute_forward_mul_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0 ; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_mul_f32); - - vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); - } - } - } -} - -static void ggml_compute_forward_mul( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_div - -static void ggml_compute_forward_div_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_div_f32); - - vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); - } - } - } -} - -static void ggml_compute_forward_div( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_div_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sqr - -static void ggml_compute_forward_sqr_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sqr_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sqr( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sqr_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sqrt - -static void ggml_compute_forward_sqrt_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sqrt_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sqrt( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sqrt_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_log - -static void ggml_compute_forward_log_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_log_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_log( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_log_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sin - -static void ggml_compute_forward_sin_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sin_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sin( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sin_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_cos - -static void ggml_compute_forward_cos_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_cos_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_cos( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_cos_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sum - -static void ggml_compute_forward_sum_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_scalar(dst)); - assert(src0->nb[0] == sizeof(float)); - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - - ggml_float sum = 0; - ggml_float row_sum = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32_ggf(ne00, - &row_sum, - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - sum += row_sum; - } - } - } - ((float *) dst->data)[0] = sum; -} - -static void ggml_compute_forward_sum_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_scalar(dst)); - - assert(src0->nb[0] == sizeof(ggml_fp16_t)); - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - - float sum = 0; - float row_sum = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f16_ggf(ne00, - &row_sum, - (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); - sum += row_sum; - } - } - } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); -} - -static void ggml_compute_forward_sum_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_scalar(dst)); - - assert(src0->nb[0] == sizeof(ggml_bf16_t)); - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - - float sum = 0; - float row_sum = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_bf16_ggf(ne00, - &row_sum, - (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); - sum += row_sum; - } - } - } - ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); -} - -static void ggml_compute_forward_sum( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sum_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_sum_f16(params, dst); - } break; - case GGML_TYPE_BF16: - { - ggml_compute_forward_sum_bf16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sum_rows - -static void ggml_compute_forward_sum_rows_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(dst->nb[0] == sizeof(float)); - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(ne0 == 1); - GGML_ASSERT(ne1 == ne01); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - for (int64_t i3 = 0; i3 < ne03; i3++) { - for (int64_t i2 = 0; i2 < ne02; i2++) { - for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); - float row_sum = 0; - ggml_vec_sum_f32(ne00, &row_sum, src_row); - dst_row[0] = row_sum; - } - } - } -} - -static void ggml_compute_forward_sum_rows( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sum_rows_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_mean - -static void ggml_compute_forward_mean_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(src0->nb[0] == sizeof(float)); - - GGML_TENSOR_UNARY_OP_LOCALS - - assert(ne0 == 1); - assert(ne1 == ne01); - assert(ne2 == ne02); - assert(ne3 == ne03); - - UNUSED(ne0); - UNUSED(ne1); - UNUSED(ne2); - UNUSED(ne3); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, - (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - - *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; - } - } - } -} - -static void ggml_compute_forward_mean( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_mean_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_argmax - -static void ggml_compute_forward_argmax_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(src0->nb[0] == sizeof(float)); - assert(dst->nb[0] == sizeof(float)); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - - const size_t nb01 = src0->nb[1]; - const size_t nb0 = dst->nb[0]; - - for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src = (float *) ((char *) src0->data + i1*nb01); - int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); - int v = 0; - ggml_vec_argmax_f32(ne00, &v, src); - dst_[0] = v; - } -} - -static void ggml_compute_forward_argmax( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_argmax_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_count_equal - -static void ggml_compute_forward_count_equal_i32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS; - - GGML_ASSERT(src0->type == GGML_TYPE_I32); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_are_same_shape(src0, src1)); - GGML_ASSERT(ggml_is_scalar(dst)); - GGML_ASSERT(dst->type == GGML_TYPE_I64); - - const int64_t nr = ggml_nrows(src0); - - const int ith = params->ith; - const int nth = params->nth; - - int64_t * sums = (int64_t *) params->wdata; - int64_t sum_thread = 0; - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - for (int64_t ir = ir0; ir < ir1; ++ir) { - const int64_t i03 = ir / (ne02*ne01); - const int64_t i02 = (ir - i03*ne03) / ne01; - const int64_t i01 = ir - i03*ne03 - i02*ne02; - - const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; - const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; - - for (int64_t i00 = 0; i00 < ne00; ++i00) { - const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); - const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); - - sum_thread += val0 == val1; - } - } - if (ith != 0) { - sums[ith] = sum_thread; - } - ggml_barrier(params->threadpool); - - if (ith != 0) { - return; - } - - for (int ith_other = 1; ith_other < nth; ++ith_other) { - sum_thread += sums[ith_other]; - } - *((int64_t *) dst->data) = sum_thread; -} - -static void ggml_compute_forward_count_equal( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_I32: - { - ggml_compute_forward_count_equal_i32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_repeat - -static void ggml_compute_forward_repeat_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(src0, dst)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne0/ne00); - const int nr1 = (int)(ne1/ne01); - const int nr2 = (int)(ne2/ne02); - const int nr3 = (int)(ne3/ne03); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne03; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne02; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne01; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_vec_cpy_f32(ne00, - (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), - (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(src0, dst)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne0/ne00); - const int nr1 = (int)(ne1/ne01); - const int nr2 = (int)(ne2/ne02); - const int nr3 = (int)(ne3/ne03); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne03; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne02; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne01; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); - ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); - // ggml_vec_cpy_f16(ne00, y, x) - for (int i = 0; i < ne00; ++i) { - y[i] = x[i]; - } - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_I16: - { - ggml_compute_forward_repeat_f16(params, dst); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_I32: - { - ggml_compute_forward_repeat_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_repeat_back - -static void ggml_compute_forward_repeat_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(dst, src0)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne00/ne0); - const int nr1 = (int)(ne01/ne1); - const int nr2 = (int)(ne02/ne2); - const int nr3 = (int)(ne03/ne3); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); - } else { - for (int k3 = 0; k3 < ne3; k3++) { - for (int k2 = 0; k2 < ne2; k2++) { - for (int k1 = 0; k1 < ne1; k1++) { - ggml_vec_set_f32(ne0, - (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), - 0); - } - } - } - } - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne3; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne2; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne1; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_vec_acc_f32(ne0, - (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), - (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_repeat_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_concat - -static void ggml_compute_forward_concat_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t dim = ggml_get_op_params_i32(dst, 0); - - GGML_ASSERT(dim >= 0 && dim < 4); - - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = src0->ne[dim]; - - const float * x; - - // TODO: smarter multi-theading - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2 += nth) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); - } else { - x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); - } - - float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } -} - -static void ggml_compute_forward_concat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - case GGML_TYPE_I32: - { - ggml_compute_forward_concat_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_abs - -static void ggml_compute_forward_abs_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_abs_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_abs( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_abs_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sgn - -static void ggml_compute_forward_sgn_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_sgn_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sgn( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sgn_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_neg - -static void ggml_compute_forward_neg_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_neg_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_neg( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_neg_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_step - -static void ggml_compute_forward_step_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_step_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_step( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_step_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_tanh - -static void ggml_compute_forward_tanh_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_tanh_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_tanh( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_tanh_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_elu - -static void ggml_compute_forward_elu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_elu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_elu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_elu_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_relu - -static void ggml_compute_forward_relu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_relu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_relu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_relu_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sigmoid - -static void ggml_compute_forward_sigmoid_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_sigmoid_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sigmoid( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sigmoid_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_gelu - -static void ggml_compute_forward_gelu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_gelu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_gelu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_gelu_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_gelu_quick - -static void ggml_compute_forward_gelu_quick_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_gelu_quick_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_gelu_quick( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_gelu_quick_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_silu - -static void ggml_compute_forward_silu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_silu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_silu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_silu_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} -// ggml_compute_forward_leaky_relu - -static void ggml_compute_forward_leaky_relu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_leaky_relu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); - } -} - -static void ggml_compute_forward_leaky_relu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_leaky_relu_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_silu_back - -static void ggml_compute_forward_silu_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * grad = dst->src[1]; - - assert(ggml_is_contiguous_1(grad)); - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - assert(ggml_are_same_shape(src0, grad)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_silu_backward_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1])), - (float *) ((char *) grad->data + i1*(grad->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_silu_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_silu_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - -static void ggml_compute_forward_hardswish_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardswish_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} -static void ggml_compute_forward_hardswish( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_hardswish_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_hardsigmoid_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardsigmoid_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_hardsigmoid( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_hardsigmoid_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_exp_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_exp_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_exp( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_exp_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - -// ggml_compute_forward_norm - -static void ggml_compute_forward_norm_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - GGML_ASSERT(eps > 0.0f); - - // TODO: optimize - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - ggml_float sum = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; - } - - float mean = sum/ne00; - - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - ggml_float sum2 = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - float v = x[i00] - mean; - y[i00] = v; - sum2 += (ggml_float)(v*v); - } - - float variance = sum2/ne00; - const float scale = 1.0f/sqrtf(variance + eps); - - ggml_vec_scale_f32(ne00, y, scale); - } - } - } -} - -static void ggml_compute_forward_norm( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_norm_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_group_rms_norm - -static void ggml_compute_forward_rms_norm_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - GGML_ASSERT(eps > 0.0f); - - // TODO: optimize - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - ggml_float sum = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)(x[i00] * x[i00]); - } - - const float mean = sum/ne00; - - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - memcpy(y, x, ne00 * sizeof(float)); - // for (int i00 = 0; i00 < ne00; i00++) { - // y[i00] = x[i00]; - // } - - const float scale = 1.0f/sqrtf(mean + eps); - - ggml_vec_scale_f32(ne00, y, scale); - } - } - } -} - -static void ggml_compute_forward_rms_norm( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_rms_norm_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_rms_norm_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - // TODO: optimize - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - // src1 is same shape as src0 => same indices - const int64_t i11 = i01; - const int64_t i12 = i02; - const int64_t i13 = i03; - - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); - - ggml_float sum_xx = 0.0; - ggml_float sum_xdz = 0.0; - - for (int64_t i00 = 0; i00 < ne00; i00++) { - sum_xx += (ggml_float)(x[i00] * x[i00]); - sum_xdz += (ggml_float)(x[i00] * dz[i00]); - } - - //const float mean = (float)(sum_xx)/ne00; - const float mean_eps = (float)(sum_xx)/ne00 + eps; - const float sum_eps = (float)(sum_xx) + eps*ne00; - //const float mean_xdz = (float)(sum_xdz)/ne00; - // we could cache rms from forward pass to improve performance. - // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. - //const float rms = sqrtf(mean_eps); - const float rrms = 1.0f / sqrtf(mean_eps); - //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) - - { - // z = rms_norm(x) - // - // rms_norm(src0) = - // scale( - // src0, - // div( - // 1, - // sqrt( - // add( - // scale( - // sum( - // sqr( - // src0)), - // (1.0/N)), - // eps)))); - - // postorder: - // ## op args grad - // 00 param src0 grad[#00] - // 01 const 1 - // 02 sqr (#00) grad[#02] - // 03 sum (#02) grad[#03] - // 04 const 1/N - // 05 scale (#03, #04) grad[#05] - // 06 const eps - // 07 add (#05, #06) grad[#07] - // 08 sqrt (#07) grad[#08] - // 09 div (#01,#08) grad[#09] - // 10 scale (#00,#09) grad[#10] - // - // backward pass, given grad[#10] - // #10: scale - // grad[#00] += scale(grad[#10],#09) - // grad[#09] += sum(mul(grad[#10],#00)) - // #09: div - // grad[#08] += neg(mul(grad[#09], div(#09,#08))) - // #08: sqrt - // grad[#07] += mul(grad[#08], div(0.5, #08)) - // #07: add - // grad[#05] += grad[#07] - // #05: scale - // grad[#03] += scale(grad[#05],#04) - // #03: sum - // grad[#02] += repeat(grad[#03], #02) - // #02: - // grad[#00] += scale(mul(#00, grad[#02]), 2.0) - // - // substitute and simplify: - // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) - // grad[#02] = repeat(grad[#03], #02) - // grad[#02] = repeat(scale(grad[#05],#04), #02) - // grad[#02] = repeat(scale(grad[#07],#04), #02) - // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) - // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) - // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) - // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) - // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) - // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) - // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) - // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) - // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) - // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) - // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) - // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) - // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) - // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) - // a = b*c + d*e - // a = b*c*f/f + d*e*f/f - // a = (b*c*f + d*e*f)*(1/f) - // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) - // a = (b + d*e/c)*c - // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) - // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms - // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms - // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms - // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms - // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms - // a = (dz + x*div(-mean_xdz,mean_eps))*rrms - // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) - // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) - // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) - } - // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) - // post-order: - // dx := x - // dx := scale(dx,-mean_xdz/mean_eps) - // dx := add(dx, dz) - // dx := scale(dx, rrms) - float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - ggml_vec_cpy_f32 (ne00, dx, x); - // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); - ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); - ggml_vec_acc_f32 (ne00, dx, dz); - ggml_vec_scale_f32(ne00, dx, rrms); - } - } - } -} - -static void ggml_compute_forward_rms_norm_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_rms_norm_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_group_norm - -static void ggml_compute_forward_group_norm_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - // TODO: optimize - - float eps; - memcpy(&eps, dst->op_params + 1, sizeof(float)); - - int n_channels = src0->ne[2]; - int n_groups = dst->op_params[0]; - int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; - for (int i = ith; i < n_groups; i += nth) { - int start = i * n_channels_per_group; - int end = start + n_channels_per_group; - if (end > n_channels) { - end = n_channels; - } - int step = end - start; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - ggml_float sum = 0.0; - for (int64_t i02 = start; i02 < end; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); - - ggml_float sumr = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - sumr += (ggml_float)x[i00]; - } - sum += sumr; - } - } - const float mean = sum / (ne00 * ne01 * step); - - ggml_float sum2 = 0.0; - for (int64_t i02 = start; i02 < end; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); - - float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); - - ggml_float sumr = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - float v = x[i00] - mean; - y[i00] = v; - sumr += (ggml_float)(v * v); - } - sum2 += sumr; - } - } - const float variance = sum2 / (ne00 * ne01 * step); - const float scale = 1.0f / sqrtf(variance + eps); - - for (int64_t i02 = start; i02 < end; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); - ggml_vec_scale_f32(ne00, y, scale); - } - } - } - } -} - -static void ggml_compute_forward_group_norm( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_group_norm_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_mul_mat - -static void ggml_compute_forward_mul_mat_one_chunk( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const int64_t num_rows_per_vec_dot, - const int64_t ir0_start, - const int64_t ir0_end, - const int64_t ir1_start, - const int64_t ir1_end) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const enum ggml_type type = src0->type; - - const bool src1_cont = ggml_is_contiguous(src1); - - ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; - - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end); - - // threads with no work simply yield (not sure if it helps) - if (ir0_start >= ir0_end || ir1_start >= ir1_end) { - return; - } - - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); - - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; - - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; - - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; - - for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int64_t i13 = (ir1 / (ne12 * ne1)); - const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); - - // broadcast src0 into src1 - const int64_t i03 = i13 / r3; - const int64_t i02 = i12 / r2; - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char*)wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); - } - - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } - } - } -} - -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const enum ggml_type type = src0->type; - - enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; - ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float; - ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; - int64_t const vec_dot_num_rows = type_traits[type].nrows; - int64_t const matmul_num_cols = type_traits[type].ncols; - int64_t const blck_size_interleave = type_traits[type].blck_size_interleave; - ggml_gemv_t const gemv = type_traits[type].gemv; - ggml_gemm_t const gemm = type_traits[type].gemm; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if GGML_USE_LLAMAFILE - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const bool src1_cont = ggml_is_contiguous(src1); - - if (src1_cont) { - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(src0->type), - (const char *)src1->data + i12*nb12 + i13*nb13, - nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - src0->type, - src1->type, - dst->type)) - goto UseGgmlGemm1; - return; - } -UseGgmlGemm1:; -#endif - - if (src1->type != vec_dot_type) { - char * wdata = params->wdata; - - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - assert(params->wsize >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - int64_t i11_processed = 0; - if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - 4, ne10, blck_size_interleave); - } - i11_processed = ne11 - ne11 % 4; - } - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); - } - } - } - } - - if (ith == 0) { - // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - atomic_store_explicit(¶ms->threadpool->current_chunk, nth, memory_order_relaxed); - } - - ggml_barrier(params->threadpool); - -#if GGML_USE_LLAMAFILE - if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(src0->type), - (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, - row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - src0->type, - vec_dot_type, - dst->type)) - goto UseGgmlGemm2; - return; - } -UseGgmlGemm2:; -#endif - - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) - const int64_t nr0 = ne0; - - // This is the size of the rest of the dimensions of the result - const int64_t nr1 = ne1 * ne2 * ne3; - - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. - // this check can be removed once they are extended to support odd numbered rows/cols too - if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { - num_rows_per_vec_dot = 1; - } - - // Now select a reasonable chunk size. - int chunk_size = 16; - - // We need to step up the size if it's small - if (nr0 == 1 || nr1 == 1) { - chunk_size = 64; - } - - // distribute the work across the inner or outer loop based on which one is larger - // The number of chunks in the 0/1 dim. - // CEIL(nr0/chunk_size) - int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - - // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. - // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915 - // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. - if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) { - // distribute the thread work across the inner or outer loop based on which one is larger - nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows - nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows - } - - // The number of elements in each chunk - const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - - if ((ggml_n_dims(src0) == 2) && gemv) { - const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; - int64_t src0_start = (ith * ne01) / nth; - int64_t src0_end = ((ith + 1) * ne01) / nth; - src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; - src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; - if (src0_start >= src0_end) return; - - // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (gemm && (ne11 > 3)) { - gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - } - for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - return; - } - - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = ith; - - while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; - - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - - ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - - if (nth >= nchunk0 * nchunk1) { - break; - } - - current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); - } -} - -// ggml_compute_forward_mul_mat_id - -static void ggml_compute_forward_mul_mat_id( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * ids = dst->src[2]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const enum ggml_type type = src0->type; - - const bool src1_cont = ggml_is_contiguous(src1); - - ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; - ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float; - int64_t const matmul_num_cols = type_traits[type].ncols; - ggml_gemv_t const gemv = type_traits[type].gemv; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - // row groups - const int n_ids = ids->ne[0]; // n_expert_used - const int n_as = ne02; // n_expert - - char * wdata_src1_end = (src1->type == vec_dot_type) ? - (char *) params->wdata : - (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); - - struct mmid_row_mapping { - int32_t i1; - int32_t i2; - }; - - int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] - - if (src1->type != vec_dot_type) { - char * wdata = params->wdata; - - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - assert(params->wsize >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); - } - } - } - } - -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] - - if (ith == 0) { - // initialize matrix_row_counts - memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); - - // group rows by src0 matrix - for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { - for (int id = 0; id < n_ids; ++id) { - const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]); - - assert(i02 >= 0 && i02 < n_as); - - MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1}; - matrix_row_counts[i02] += 1; - } - } - } - - ggml_barrier(params->threadpool); - - // compute each matrix multiplication in sequence - for (int cur_a = 0; cur_a < n_as; ++cur_a) { - const int64_t cne1 = matrix_row_counts[cur_a]; - - if (cne1 == 0) { - continue; - } - - const char * src0_cur = (const char *) src0->data + cur_a*nb02; - - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1; // src1 rows - - if (((ggml_n_dims(src0) - 1) == 2) && gemv) { - int64_t src0_cur_start = (ith * ne01) / nth; - int64_t src0_cur_end = ((ith + 1) * ne01) / nth; - src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; - src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; - if (src0_cur_start >= src0_cur_end) return; - - for (int ir1 = 0; ir1 < nr1; ir1++) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12)); - - gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, - (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); - } - continue; - } - - // distribute the thread work across the inner or outer loop based on which one is larger - - const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows - const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows - - const int64_t ith0 = ith % nth0; - const int64_t ith1 = ith / nth0; - - const int64_t dr0 = (nr0 + nth0 - 1)/nth0; - const int64_t dr1 = (nr1 + nth1 - 1)/nth1; - - const int64_t ir010 = dr0*ith0; - const int64_t ir011 = MIN(ir010 + dr0, nr0); - - const int64_t ir110 = dr1*ith1; - const int64_t ir111 = MIN(ir110 + dr1, nr1); - - // threads with no work simply yield (not sure if it helps) - //if (ir010 >= ir011 || ir110 >= ir111) { - // sched_yield(); - // continue; - //} - - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; - - // attempt to reduce false-sharing (does not seem to make a difference) - float tmp[16]; - - for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { - for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t _i12 = ir1; // logical row index for this expert - - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11)*row_size - : (i11*nb11 + i12*nb12)); - - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); - - //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); - } - - memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); - } - } - } - } - -#undef MMID_MATRIX_ROW -} - -// ggml_compute_forward_out_prod - -static void ggml_compute_forward_out_prod_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(ne03 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - // GGML_ASSERT(nb0 <= nb1); - // GGML_ASSERT(nb1 <= nb2); - // GGML_ASSERT(nb2 <= nb3); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - - if (ith == 0) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); - } - ggml_barrier(params->threadpool); - - // dst[:,:,:,:] = 0 - // for i2,i3: - // for i1: - // for i01: - // for i0: - // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] - - // parallelize by last three dimensions - - // total rows in dst - const int64_t nr = ne1*ne2*ne3; - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - // block-tiling attempt - const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); - const int64_t blck_1 = 16; - - for (int64_t bir = ir0; bir < ir1; bir += blck_1) { - const int64_t bir1 = MIN(bir + blck_1, ir1); - for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { - const int64_t bne01 = MIN(bi01 + blck_0, ne01); - for (int64_t ir = bir; ir < bir1; ++ir) { - // dst indices - const int64_t i3 = ir/(ne2*ne1); - const int64_t i2 = (ir - i3*ne2*ne1)/ne1; - const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); - - const int64_t i02 = i2; - const int64_t i03 = i3; - - //const int64_t i10 = i1; - const int64_t i12 = i2; - const int64_t i13 = i3; - -#if GGML_VEC_MAD_UNROLL > 2 - const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); - for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { - const int64_t i11 = i01; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - - ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); - } - for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { - const int64_t i11 = i01; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - - ggml_vec_mad_f32(ne0, d, s0, *s1); - } -#else - for (int64_t i01 = bi01; i01 < bne01; ++i01) { - const int64_t i11 = i01; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - - ggml_vec_mad_f32(ne0, d, s0, *s1); - } -#endif - } - } - } -} - -static void ggml_compute_forward_out_prod_q_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int ith = params->ith; - const int nth = params->nth; - - const enum ggml_type type = src0->type; - ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 dim0 - GGML_ASSERT(nb00 == ggml_type_size(type)); - - // dst dim0 cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - // GGML_ASSERT(nb0 <= nb1); - // GGML_ASSERT(nb1 <= nb2); - // GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - - if (ith == 0) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); - } - ggml_barrier(params->threadpool); - - // parallelize by last three dimensions - - // total rows in dst - const int64_t nr = ne1*ne2*ne3; - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - // dst[:,:,:,:] = 0 - // for i2,i3: - // for i1: - // for i01: - // for i0: - // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] - - float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; - - for (int64_t ir = ir0; ir < ir1; ++ir) { - // dst indices - const int64_t i3 = ir/(ne2*ne1); - const int64_t i2 = (ir - i3*ne2*ne1)/ne1; - const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); - - const int64_t i02 = i2; - const int64_t i03 = i3; - - //const int64_t i10 = i1; - const int64_t i12 = i2; - const int64_t i13 = i3; - - for (int64_t i01 = 0; i01 < ne01; ++i01) { - const int64_t i11 = i01; - - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - - dequantize_row_q(s0, wdata, ne0); - ggml_vec_mad_f32(ne0, d, wdata, *s1); - } - } -} - -static void ggml_compute_forward_out_prod( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - { - ggml_compute_forward_out_prod_q_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - GGML_ABORT("fatal error"); // todo - // ggml_compute_forward_out_prod_f16_f32(params, dst); - } - case GGML_TYPE_F32: - { - ggml_compute_forward_out_prod_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_scale - -static void ggml_compute_forward_scale_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - // scale factor - float v; - memcpy(&v, dst->op_params, sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const size_t nb01 = src0->nb[1]; - - const size_t nb1 = dst->nb[1]; - - for (int i1 = ir0; i1 < ir1; i1++) { - if (dst->data != src0->data) { - // src0 is same shape as dst => same indices - memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); - } - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); - } -} - -static void ggml_compute_forward_scale( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_scale_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_set - -static void ggml_compute_forward_set_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // view src0 and dst with these strides and data offset inbytes during set - // nb0 is implicitly element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) dst->op_params)[0]; - size_t nb2 = ((int32_t *) dst->op_params)[1]; - size_t nb3 = ((int32_t *) dst->op_params)[2]; - size_t offset = ((int32_t *) dst->op_params)[3]; - bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - if (params->ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. - // => do it in INIT phase - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - } - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src1); - const int nc = src1->ne[0]; - - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) - - // src0 and dst as viewed during set - const size_t nb0 = ggml_element_size(src0); - - const int im0 = (ne10 == 0 ? 0 : ne10-1); - const int im1 = (ne11 == 0 ? 0 : ne11-1); - const int im2 = (ne12 == 0 ? 0 : ne12-1); - const int im3 = (ne13 == 0 ? 0 : ne13-1); - - GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); - - GGML_ASSERT(nb10 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 and dst are viewed with shape of src1 and offset - // => same indices - const int i3 = ir/(ne12*ne11); - const int i2 = (ir - i3*ne12*ne11)/ne11; - const int i1 = (ir - i3*ne12*ne11 - i2*ne11); - - ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); - } -} - -static void ggml_compute_forward_set( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_set_f32(params, dst); - } break; - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_cpy - -static void ggml_compute_forward_cpy( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, dst); -} - -// ggml_compute_forward_cont - -static void ggml_compute_forward_cont( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, dst); -} - -// ggml_compute_forward_reshape - -static void ggml_compute_forward_reshape( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - // NOP - UNUSED(params); - UNUSED(dst); -} - -// ggml_compute_forward_view - -static void ggml_compute_forward_view( - const struct ggml_compute_params * params, - const struct ggml_tensor * dst) { - // NOP - UNUSED(params); - UNUSED(dst); -} - -// ggml_compute_forward_permute - -static void ggml_compute_forward_permute( - const struct ggml_compute_params * params, - const struct ggml_tensor * dst) { - // NOP - UNUSED(params); - UNUSED(dst); -} - -// ggml_compute_forward_transpose - -static void ggml_compute_forward_transpose( - const struct ggml_compute_params * params, - const struct ggml_tensor * dst) { - // NOP - UNUSED(params); - UNUSED(dst); -} - -// ggml_compute_forward_get_rows - -static void ggml_compute_forward_get_rows_q( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); - - const enum ggml_type type = src0->type; - ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - - assert(ne0 == nc); - assert(ne02 == ne11); - assert(nb00 == ggml_type_size(type)); - assert(ggml_nrows(dst) == nr); - - const int ith = params->ith; - const int nth = params->nth; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int64_t i = ir0; i < ir1; ++i) { - const int64_t i12 = i/(ne11*ne10); - const int64_t i11 = (i - i12*ne11*ne10)/ne10; - const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - - GGML_ASSERT(i01 >= 0 && i01 < ne01); - - dequantize_row_q( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); - } -} - -static void ggml_compute_forward_get_rows_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); - - assert(ne0 == nc); - assert(ne02 == ne11); - assert(nb00 == sizeof(ggml_fp16_t)); - assert(ggml_nrows(dst) == nr); - - const int ith = params->ith; - const int nth = params->nth; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int64_t i = ir0; i < ir1; ++i) { - const int64_t i12 = i/(ne11*ne10); - const int64_t i11 = (i - i12*ne11*ne10)/ne10; - const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - - GGML_ASSERT(i01 >= 0 && i01 < ne01); - - ggml_fp16_to_fp32_row( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); - } -} - -static void ggml_compute_forward_get_rows_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); - - assert(ne0 == nc); - assert(ne02 == ne11); - assert(nb00 == sizeof(ggml_bf16_t)); - assert(ggml_nrows(dst) == nr); - - const int ith = params->ith; - const int nth = params->nth; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int64_t i = ir0; i < ir1; ++i) { - const int64_t i12 = i/(ne11*ne10); - const int64_t i11 = (i - i12*ne11*ne10)/ne10; - const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - - GGML_ASSERT(i01 >= 0 && i01 < ne01); - - ggml_bf16_to_fp32_row( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); - } -} - -static void ggml_compute_forward_get_rows_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); - - assert(ne0 == nc); - assert(ne02 == ne11); - assert(nb00 == sizeof(float)); - assert(ggml_nrows(dst) == nr); - - const int ith = params->ith; - const int nth = params->nth; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int64_t i = ir0; i < ir1; ++i) { - const int64_t i12 = i/(ne11*ne10); - const int64_t i11 = (i - i12*ne11*ne10)/ne10; - const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - - GGML_ASSERT(i01 >= 0 && i01 < ne01); - - ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), - (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); - } -} - -static void ggml_compute_forward_get_rows( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - { - ggml_compute_forward_get_rows_q(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_f16(params, dst); - } break; - case GGML_TYPE_BF16: - { - ggml_compute_forward_get_rows_bf16(params, dst); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_I32: - { - ggml_compute_forward_get_rows_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - - //static bool first = true; - //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - //if (first) { - // first = false; - //} else { - // for (int k = 0; k < dst->ne[1]; ++k) { - // for (int j = 0; j < dst->ne[0]/16; ++j) { - // for (int i = 0; i < 16; ++i) { - // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // exit(0); - //} -} - -// ggml_compute_forward_get_rows_back - -static void ggml_compute_forward_get_rows_back_f32_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_is_contiguous(dst)); - - // ggml_compute_forward_dup_same_cont(params, opt0, dst); - - memset(dst->data, 0, ggml_nbytes(dst)); - - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - - GGML_ASSERT( dst->ne[0] == nc); - GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; - - for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); - } - } -} - -static void ggml_compute_forward_get_rows_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_is_contiguous(dst)); - - // ggml_compute_forward_dup_same_cont(params, opt0, dst); - - memset(dst->data, 0, ggml_nbytes(dst)); - - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - - GGML_ASSERT( dst->ne[0] == nc); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; - - ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + r*dst->nb[1]), - (float *) ((char *) dst->data + r*dst->nb[1]), - (float *) ((char *) src0->data + i*src0->nb[1])); - } -} - -static void ggml_compute_forward_get_rows_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_back_f32_f16(params, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_get_rows_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - - //static bool first = true; - //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - //if (first) { - // first = false; - //} else { - // for (int k = 0; k < dst->ne[1]; ++k) { - // for (int j = 0; j < dst->ne[0]/16; ++j) { - // for (int i = 0; i < 16; ++i) { - // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // exit(0); - //} -} - -// ggml_compute_forward_diag - -static void ggml_compute_forward_diag_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - // TODO: handle transposed/permuted matrices - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(ne00 == ne0); - GGML_ASSERT(ne00 == ne1); - GGML_ASSERT(ne01 == 1); - GGML_ASSERT(ne02 == ne2); - GGML_ASSERT(ne03 == ne3); - - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb0 == sizeof(float)); - - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = 0; i1 < ne1; i1++) { - float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); - for (int i0 = 0; i0 < i1; i0++) { - d[i0] = 0; - } - d[i1] = s[i1]; - for (int i0 = i1+1; i0 < ne0; i0++) { - d[i0] = 0; - } - } - } - } -} - -static void ggml_compute_forward_diag( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_diag_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_diag_mask_inf - -static void ggml_compute_forward_diag_mask_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const float value) { - - const struct ggml_tensor * src0 = dst->src[0]; - - const int ith = params->ith; - const int nth = params->nth; - - const int n_past = ((int32_t *) dst->op_params)[0]; - const bool inplace = src0->data == dst->data; - - GGML_ASSERT(n_past >= 0); - - if (!inplace) { - if (ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. - // => do it in INIT phase - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - } - - // TODO: handle transposed/permuted matrices - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - const int nr = src0->ne[1]; - const int nz = n/nr; - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int k = 0; k < nz; k++) { - for (int j = ith; j < nr; j += nth) { - for (int i = n_past; i < nc; i++) { - if (i > n_past + j) { - *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; - } - } - } - } -} - -static void ggml_compute_forward_diag_mask_inf( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_diag_mask_zero( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_diag_mask_f32(params, dst, 0); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_soft_max - -static void ggml_compute_forward_soft_max_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - assert(ggml_is_contiguous(dst)); - assert(ggml_are_same_shape(src0, dst)); - - float scale = 1.0f; - float max_bias = 0.0f; - - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); - - // TODO: handle transposed/permuted matrices - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - //const int64_t ne11 = src1 ? src1->ne[1] : 1; - - // TODO: is this supposed to be ceil instead of floor? - // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 - const uint32_t n_head = ne02; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; - - const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); - - for (int i1 = ir0; i1 < ir1; i1++) { - // ALiBi - const uint32_t h = (i1/ne01)%ne02; // head - const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; - - float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); - float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); - - // broadcast the mask across rows - ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; - float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; - - ggml_vec_cpy_f32 (nc, wp, sp); - ggml_vec_scale_f32(nc, wp, scale); - if (mp_f32) { - if (use_f16) { - for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); - } - } else { - for (int i = 0; i < nc; ++i) { - wp[i] += slope*mp_f32[i]; - } - } - } - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(wp[i])); - } -#endif - - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, wp); - - ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max); - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(nc, dp, sum); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(dp[i])); - assert(!isinf(dp[i])); - } -#endif - } -} - -static void ggml_compute_forward_soft_max( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_soft_max_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - -// ggml_compute_forward_soft_max_back - -static void ggml_compute_forward_soft_max_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_are_same_shape(src1, dst)); - - // TODO: handle transposed/permuted matrices - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); - float *y = (float *)((char *) src1->data + i1*src1->nb[1]); - float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(dy[i])); - assert(!isnan(y[i])); - } -#endif - // Jii = yi - yi*yi - // Jij = -yi*yj - // J = diag(y)-y.T*y - // dx = J * dy - // dxk = sum_i(Jki * dyi) - // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk - // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk - // dxk = sum_i(-yk*yi * dyi) + yk*dyk - // dxk = -yk * sum_i(yi * dyi) + yk*dyk - // dxk = -yk * dot(y, dy) + yk*dyk - // dxk = yk * (- dot(y, dy) + dyk) - // dxk = yk * (dyk - dot(y, dy)) - // - // post-order: - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - - // linear runtime, no additional memory - float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); - ggml_vec_cpy_f32 (nc, dx, dy); - ggml_vec_acc1_f32(nc, dx, -dot_y_dy); - ggml_vec_mul_f32 (nc, dx, dx, y); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(dx[i])); - assert(!isinf(dx[i])); - } -#endif - } -} - -static void ggml_compute_forward_soft_max_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_soft_max_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_clamp - -static void ggml_compute_forward_clamp_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - float min; - float max; - memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - for (int j = ith; j < n; j += nth) { - float * dst_ptr = (float *) ((char *) dst->data + j*nb1); - float * src0_ptr = (float *) ((char *) src0->data + j*nb01); - - for (int i = 0; i < nc; i++) { - dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); - } - } -} - -static void ggml_compute_forward_clamp( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_clamp_f32(params, dst); - } break; - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_I64: - case GGML_TYPE_F64: - case GGML_TYPE_COUNT: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_rope - -static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / MAX(0.001f, high - low); - return 1 - MIN(1, MAX(0, y)); -} - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static void rope_yarn( - float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); - } - *cos_theta = cosf(theta) * mscale; - *sin_theta = sinf(theta) * mscale; -} - -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); -} - -static void ggml_rope_cache_init( - float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, - float * cache, float sin_sign, float theta_scale) { - // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py - float theta = theta_base; - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float ff = freq_factors ? freq_factors[i0/2] : 1.0f; - rope_yarn( - theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] - ); - cache[i0 + 1] *= sin_sign; - - theta *= theta_scale; - } -} - -void ggml_rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] -) { - // start and end correction dims - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = MAX(0, start); - dims[1] = MIN(n_dims - 1, end); -} - -static void ggml_compute_forward_rope_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const bool forward) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * src2 = dst->src[2]; - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - GGML_ASSERT(nb00 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(dst); - - GGML_ASSERT(n_dims <= ne0); - GGML_ASSERT(n_dims % 2 == 0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - - const float * freq_factors = NULL; - if (src2 != NULL) { - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; - } - - // backward process uses inverse rotation by cos and sin. - // cos and sin build a rotation matrix, where the inverse is the transpose. - // this essentially just switches the sign of sin. - const float sin_sign = forward ? 1.0f : -1.0f; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - - float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; - ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); - - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { - const float cos_theta = cache[i0 + 0]; - const float sin_theta = cache[i0 + 1]; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[1]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; - } - } else { - for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { - const int64_t ic = i0/2; - - const float cos_theta = cache[i0 + 0]; - const float sin_theta = cache[i0 + 1]; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - } - } - - for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } - } - } -} - -// TODO: deduplicate f16/f32 code -static void ggml_compute_forward_rope_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const bool forward) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * src2 = dst->src[2]; - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - GGML_TENSOR_UNARY_OP_LOCALS - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(dst); - - GGML_ASSERT(n_dims <= ne0); - GGML_ASSERT(n_dims % 2 == 0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - - const float * freq_factors = NULL; - if (src2 != NULL) { - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; - } - - // backward process uses inverse rotation by cos and sin. - // cos and sin build a rotation matrix, where the inverse is the transpose. - // this essentially just switches the sign of sin. - const float sin_sign = forward ? 1.0f : -1.0f; - - const int32_t * pos = (const int32_t *) src1->data; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - const int64_t p = pos[i2]; - - float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; - ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); - - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - if (!is_neox) { - for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { - const float cos_theta = cache[i0 + 0]; - const float sin_theta = cache[i0 + 1]; - - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } - } else { - for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { - const int64_t ic = i0/2; - - const float cos_theta = cache[i0 + 0]; - const float sin_theta = cache[i0 + 1]; - - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } - } - - for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } - } - } - } -} - -static void ggml_compute_forward_rope( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_rope_f16(params, dst, true); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_rope_f32(params, dst, true); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_rope_back - -static void ggml_compute_forward_rope_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_rope_f16(params, dst, false); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_rope_f32(params, dst, false); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_conv_transpose_1d - -static void ggml_compute_forward_conv_transpose_1d_f16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (ith == 0) { - memset(params->wdata, 0, params->wsize); - - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // permute source data (src1) from (L x Cin) to (Cin x L) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; - ggml_fp16_t * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f16(ne02, &v, 0, - (ggml_fp16_t *) wdata_src + i1n, 0, - (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void ggml_compute_forward_conv_transpose_1d_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02; - - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (ith == 0) { - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) - { - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; - } - } - } - - // need to zero dst since we are accumulating into it - memset(dst->data, 0, ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - - // total rows in dst - const int nr = ne1; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f32(ne02, &v, 0, - wdata_src + i1n, 0, - wdata_kernel + i00*ne02, 0, 1); - dst_data[i10*s0 + i00] += v; - } - } - } -} - -static void ggml_compute_forward_conv_transpose_1d( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_transpose_1d_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_im2col_f32 -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void ggml_compute_forward_im2col_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - const int64_t IH = is_2D ? ne11 : 1; - const int64_t IW = ne10; - - const int64_t KH = is_2D ? ne01 : 1; - const int64_t KW = ne00; - - const int64_t OH = is_2D ? ne2 : 1; - const int64_t OW = ne1; - - int ofs0 = is_2D ? nb13 : nb12; - int ofs1 = is_2D ? nb12 : nb11; - - GGML_ASSERT(nb10 == sizeof(float)); - - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - float * const wdata = (float *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic += nth) { - - // micro kernel - float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - - for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; - - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; - } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } -} - - -// ggml_compute_forward_im2col_f16 -// src0: kernel [OC, IC, KH, KW] -// src1: image [N, IC, IH, IW] -// dst: result [N, OH, OW, IC*KH*KW] -static void ggml_compute_forward_im2col_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - const int64_t IH = is_2D ? ne11 : 1; - const int64_t IW = ne10; - - const int64_t KH = is_2D ? ne01 : 1; - const int64_t KW = ne00; - - const int64_t OH = is_2D ? ne2 : 1; - const int64_t OW = ne1; - - int ofs0 = is_2D ? nb13 : nb12; - int ofs1 = is_2D ? nb12 : nb11; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 - for (int64_t iow = 0; iow < OW; iow++) { - for (int64_t iic = ith; iic < IC; iic += nth) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] - - for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 - for (int64_t ikw = 0; ikw < KW; ikw++) { - const int64_t iiw = iow*s0 + ikw*d0 - p0; - const int64_t iih = ioh*s1 + ikh*d1 - p1; - - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; - } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); - } - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_im2col( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - switch (dst->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_im2col_f16(params, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_im2col_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_im2col_back_f32 - -static void ggml_compute_forward_im2col_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; - const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t N = is_2D ? ne3 : ne2; - const int64_t IC = is_2D ? ne2 : ne1; - const int64_t IH = is_2D ? ne1 : 1; - const int64_t IW = ne0; - - const int64_t KH = is_2D ? ne01 : 1; - const int64_t KW = ne00; - - const int64_t OH = is_2D ? ne12 : 1; - const int64_t OW = ne11; - - int ofs0 = is_2D ? nb3 : nb2; - int ofs1 = is_2D ? nb2 : nb1; - - GGML_ASSERT(nb0 == sizeof(float)); - - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] - { - float * const wdata = (float *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iic = ith; iic < IC; iic += nth) { - for (int64_t iih = 0; iih < IH; iih++) { - for (int64_t iiw = 0; iiw < IW; iiw++) { - - // micro kernel - float grad = 0.0f; - for (int64_t ikh = 0; ikh < KH; ikh++) { - for (int64_t ikw = 0; ikw < KW; ikw++) { - // For s0 > 1 some values were skipped over in the forward pass. - // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well. - const int64_t tmpw = (iiw + p0 - ikw*d0); - if (tmpw % s0 != 0) { - continue; - } - const int64_t iow = tmpw / s0; - - // Equivalent logic as above except for s1. - int64_t ioh; - if (is_2D) { - const int64_t tmph = iih + p1 - ikh*d1; - - if (tmph % s1 != 0) { - continue; - } - - ioh = tmph / s1; - } else { - ioh = 0; - } - - if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) { - continue; - } - - const float * const src_data = (const float *) src1->data - + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] - grad += src_data[iic*(KH*KW) + ikh*KW + ikw]; - } - } - float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW] - dst_data[iih*IW + iiw] = grad; - } - } - } - } - } -} - -// ggml_compute_forward_conv_transpose_2d - -static void ggml_compute_forward_conv_transpose_2d( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00*ne01*ne02*ne03; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (ith == 0) { - memset(params->wdata, 0, params->wsize); - - // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); - ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; - } - } - } - } - } - - // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; - for (int i12 = 0; i12 < ne12; i12++) { - for (int i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); - ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; - for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); - } - } - } - } - - memset(dst->data, 0, ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - - const int32_t stride = ggml_get_op_params_i32(dst, 0); - - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = wdata + nk; - - for (int i2 = ip0; i2 < ip1; i2++) { // Cout - float * dst_data = (float *)((char *) dst->data + i2*nb2); - ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; - for (int i11 = 0; i11 < ne11; i11++) { - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i11*ne10*ne12 + i10*ne12; - for (int i01 = 0; i01 < ne01; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f16(ne03, &v, 0, - wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); - dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; - } - } - } - } - } -} - -// ggml_compute_forward_pool_1d_sk_p0 - -static void ggml_compute_forward_pool_1d_sk_p0( - const struct ggml_compute_params * params, - const enum ggml_op_pool op, - const int k, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src = dst->src[0]; - - assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); - - if (params->ith != 0) { - return; - } - - const char * cdata = (const char *)src->data; - const char * const data_end = cdata + ggml_nbytes(src); - float * drow = (float *)dst->data; - - const int64_t rs = dst->ne[0]; - - while (cdata < data_end) { - const void * srow = (const void *)cdata; - int j = 0; - for (int64_t i = 0; i < rs; ++i) { - switch (op) { - case GGML_OP_POOL_AVG: drow[i] = 0; break; - case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); - switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - ++j; - } - switch (op) { - case GGML_OP_POOL_AVG: drow[i] /= k; break; - case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - } - - cdata += src->nb[1]; - drow += rs; - } -} - -// ggml_compute_forward_pool_1d - -static void ggml_compute_forward_pool_1d( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int s0 = opts[2]; - const int p0 = opts[3]; - GGML_ASSERT(p0 == 0); // padding not supported - GGML_ASSERT(k0 == s0); // only s = k supported - - ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); -} - -// ggml_compute_forward_pool_2d - -static void ggml_compute_forward_pool_2d( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src = dst->src[0]; - - assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); - - if (params->ith != 0) { - return; - } - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - const char * cdata = (const char*)src->data; - const char * const data_end = cdata + ggml_nbytes(src); - - const int64_t px = dst->ne[0]; - const int64_t py = dst->ne[1]; - const int64_t pa = px * py; - - float * dplane = (float *)dst->data; - - const int ka = k0 * k1; - const int offset0 = -p0; - const int offset1 = -p1; - - while (cdata < data_end) { - for (int oy = 0; oy < py; ++oy) { - float * const drow = dplane + oy * px; - for (int ox = 0; ox < px; ++ox) { - float * const out = drow + ox; - switch (op) { - case GGML_OP_POOL_AVG: *out = 0; break; - case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - - const int ix = offset0 + ox * s0; - const int iy = offset1 + oy * s1; - - for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); - for (int kx = 0; kx < k0; ++kx) { - int j = ix + kx; - if (j < 0 || j >= src->ne[0]) continue; - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); - switch (op) { - case GGML_OP_POOL_AVG: *out += srow_j; break; - case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - } - } - switch (op) { - case GGML_OP_POOL_AVG: *out /= ka; break; - case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); - } - } - } - - cdata += src->nb[2]; - dplane += pa; - } -} - -// ggml_compute_forward_pool_2d_back - -static void ggml_compute_forward_pool_2d_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src = dst->src[0]; - const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst - - assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - - if (params->ith != 0) { - return; - } - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - char * cdata = (char *) dst->data; - const char * cdataf = (const char *) dstf->data; - const char * const data_end = cdata + ggml_nbytes(dst); - - GGML_ASSERT(params->ith == 0); - memset(cdata, 0, ggml_nbytes(dst)); - - const int64_t px = src->ne[0]; - const int64_t py = src->ne[1]; - const int64_t pa = px * py; - - const float * splane = (const float *) src->data; - - const int ka = k0 * k1; - const int offset0 = -p0; - const int offset1 = -p1; - - while (cdata < data_end) { - for (int oy = 0; oy < py; ++oy) { - const float * const srow = splane + oy * px; - for (int ox = 0; ox < px; ++ox) { - const float grad0 = srow[ox]; - - const int ix = offset0 + ox * s0; - const int iy = offset1 + oy * s1; - - if (op == GGML_OP_POOL_MAX) { - float maxval = -FLT_MAX; - int kxmax = -1; - int kymax = -1; - - for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= dst->ne[1]) { - continue; - } - const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky)); - for (int kx = 0; kx < k0; ++kx) { - int j = ix + kx; - if (j < 0 || j >= dst->ne[0]) { - continue; - } - - const float val = dst->type == GGML_TYPE_F32 ? - ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); - if (val <= maxval) { - continue; - } - - maxval = val; - kxmax = kx; - kymax = ky; - } - } - - if (kxmax == -1 || kymax == -1) { - continue; - } - - void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax)); - const int j = ix + kxmax; - if (dst->type == GGML_TYPE_F32) { - ((float *) drow)[j] += grad0; - } else { - ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); - } - } else if (op == GGML_OP_POOL_AVG) { - const float grad = grad0 / ka; - - for (int ky = 0; ky < k1; ++ky) { - if (iy + ky < 0 || iy + ky >= dst->ne[1]) { - continue; - } - void * drow = (void *)(cdata + dst->nb[1] * (iy + ky)); - for (int kx = 0; kx < k0; ++kx) { - int j = ix + kx; - if (j < 0 || j >= dst->ne[0]) { - continue; - } - - if (dst->type == GGML_TYPE_F32) { - ((float *) drow)[j] += grad; - } else { - ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad); - } - } - } - } else { - GGML_ASSERT(false); - } - } - } - - cdata += dst->nb[2]; - cdataf += dst->nb[2]; - splane += pa; - } -} - -// ggml_compute_forward_upscale - -static void ggml_compute_forward_upscale_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - const float sf0 = (float)ne0/src0->ne[0]; - const float sf1 = (float)ne1/src0->ne[1]; - const float sf2 = (float)ne2/src0->ne[2]; - const float sf3 = (float)ne3/src0->ne[3]; - - // TODO: optimize - - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - const int64_t i01 = i1 / sf1; - for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t i00 = i0 / sf0; - - const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } -} - -static void ggml_compute_forward_upscale( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_upscale_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - -// ggml_compute_forward_pad - -static void ggml_compute_forward_pad_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT( dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float * dst_ptr = (float *) dst->data; - - // TODO: optimize - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } else { - dst_ptr[dst_idx] = 0; - } - } - } - } - } -} - -static void ggml_compute_forward_pad( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_pad_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - -// ggml_compute_forward_arange - -static void ggml_compute_forward_arange_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - GGML_ASSERT(dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const float start = ggml_get_op_params_f32(dst, 0); - const float stop = ggml_get_op_params_f32(dst, 1); - const float step = ggml_get_op_params_f32(dst, 2); - - const int64_t steps = (int64_t) ceilf((stop - start) / step); - - GGML_ASSERT(ggml_nelements(dst) == steps); - - for (int64_t i = ith; i < steps; i+= nth) { - float value = start + step * i; - ((float *)dst->data)[i] = value; - } -} - -static void ggml_compute_forward_arange( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - switch (dst->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_arange_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_timestep_embedding_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - const int dim = ggml_get_op_params_i32(dst, 0); - const int max_period = ggml_get_op_params_i32(dst, 1); - - int half = dim / 2; - - for (int64_t i = 0; i < ne00; i++) { - float * embed_data = (float *)((char *) dst->data + i*nb1); - for (int64_t j = ith; j < half; j += nth) { - float timestep = ((float *)src0->data)[i]; - float freq = (float)expf(-logf(max_period) * j / half); - float arg = timestep * freq; - embed_data[j] = cosf(arg); - embed_data[j + half] = sinf(arg); - } - if (dim % 2 != 0 && ith == 0) { - embed_data[dim] = 0.f; - } - } -} - -static void ggml_compute_forward_timestep_embedding( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_timestep_embedding_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_argsort - -static void ggml_compute_forward_argsort_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_TENSOR_UNARY_OP_LOCALS - - GGML_ASSERT(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0); - - for (int64_t i = ith; i < nr; i += nth) { - int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); - const float * src_data = (float *)((char *) src0->data + i*nb01); - - for (int64_t j = 0; j < ne0; j++) { - dst_data[j] = j; - } - - // C doesn't have a functional sort, so we do a bubble sort instead - for (int64_t j = 0; j < ne0; j++) { - for (int64_t k = j + 1; k < ne0; k++) { - if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || - (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { - int32_t tmp = dst_data[j]; - dst_data[j] = dst_data[k]; - dst_data[k] = tmp; - } - } - } - } -} - -static void ggml_compute_forward_argsort( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_argsort_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_flash_attn_ext - -static void ggml_compute_forward_flash_attn_ext_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * mask, - struct ggml_tensor * dst) { - - GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne2 == N); - - // input tensor rows must be contiguous - GGML_ASSERT(nbq0 == ggml_type_size(q->type)); - GGML_ASSERT(nbk0 == ggml_type_size(k->type)); - GGML_ASSERT(nbv0 == ggml_type_size(v->type)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev0 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nev0 == D); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - // broadcast factors - const int64_t rk2 = neq2/nek2; - const int64_t rk3 = neq3/nek3; - - const int64_t rv2 = neq2/nev2; - const int64_t rv3 = neq3/nev3; - - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int nr = neq1*neq2*neq3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float scale = 1.0f; - float max_bias = 0.0f; - float logit_softcap = 0.0f; - - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); - memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float)); - - if (logit_softcap != 0) { - scale /= logit_softcap; - } - - const uint32_t n_head = neq2; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - enum ggml_type const k_vec_dot_type = type_traits[k->type].vec_dot_type; - ggml_from_float_t const q_to_vec_dot = type_traits[k_vec_dot_type].from_float; - ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot; - ggml_to_float_t const v_to_float = type_traits[v->type].to_float; - - GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type"); - GGML_ASSERT(v_to_float && "fattn: unsupported V-type"); - - // loop over n_batch and n_head - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - - const uint32_t h = iq2; // head index - const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; - - float S = 0.0f; // sum - float M = -INFINITY; // maximum KQ value - - float * VKQ32 = (float *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator - float * V32 = (VKQ32 + 1*D); // (temporary) FP32 V buffer - ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator - ggml_fp16_t * Q_q = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16 - - if (v->type == GGML_TYPE_F16) { - memset(VKQ16, 0, D*sizeof(ggml_fp16_t)); - } else { - memset(VKQ32, 0, D*sizeof(float)); - } - - const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; - - // k indices - const int ik3 = iq3 / rk3; - const int ik2 = iq2 / rk2; - - // v indices - const int iv3 = iq3 / rv3; - const int iv2 = iq2 / rv2; - - const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); - q_to_vec_dot(pq, Q_q, D); - - // online softmax / attention - // loop over n_kv and n_head_kv - // ref: https://arxiv.org/pdf/2112.05682.pdf - for (int64_t ic = 0; ic < nek1; ++ic) { - const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f; - if (mv == -INFINITY) { - continue; - } - - float s; // KQ value - - const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); - kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1); - - s = s*scale; // scale KQ value - - if (logit_softcap != 0.0f) { - s = logit_softcap*tanhf(s); - } - - s += mv; // apply mask - - const float Mold = M; - - float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value - float vs = 1.0f; // post-softmax KQ value, expf(s - M) - - const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); - - if (v->type == GGML_TYPE_F16) { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); - - // V = V*expf(Mold - M) - ggml_vec_scale_f16(D, VKQ16, ms); - } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); - } - - // V += v*expf(s - M) - ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs); - } else { - if (s > M) { - // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f - M = s; - ms = expf(Mold - M); - - // V = V*expf(Mold - M) - ggml_vec_scale_f32(D, VKQ32, ms); - } else { - // no new maximum, ms == 1.0f, vs != 1.0f - vs = expf(s - M); - } - - v_to_float(v_data, V32, D); - - // V += v*expf(s - M) - ggml_vec_mad_f32(D, VKQ32, V32, vs); - } - - S = S*ms + vs; // scale and increment sum with partial sum - } - - if (v->type == GGML_TYPE_F16) { - for (int64_t d = 0; d < D; ++d) { - VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]); - } - } - - // V /= S - const float S_inv = 1.0f/S; - ggml_vec_scale_f32(D, VKQ32, S_inv); - - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // original - //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); - - // permute(0, 2, 1, 3) - memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); - } -} - -static void ggml_compute_forward_flash_attn_ext( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const struct ggml_tensor * mask, - struct ggml_tensor * dst) { - switch (dst->op_params[3]) { - case GGML_PREC_DEFAULT: - case GGML_PREC_F32: - { - // uses F32 accumulators - ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_flash_attn_back - -static void ggml_compute_forward_flash_attn_back_f32( - const struct ggml_compute_params * params, - const bool masked, - struct ggml_tensor * dst) { - - const struct ggml_tensor * q = dst->src[0]; - const struct ggml_tensor * k = dst->src[1]; - const struct ggml_tensor * v = dst->src[2]; - const struct ggml_tensor * d = dst->src[3]; - - GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - GGML_TENSOR_LOCALS(int64_t, ned, d, ne) - GGML_TENSOR_LOCALS(size_t, nbd, d, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - const int mxDM = MAX(D, Mup); - - // GGML_ASSERT(ne0 == D); - // GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); - - GGML_ASSERT(nbq0 == sizeof(float)); - GGML_ASSERT(nbk0 == sizeof(float)); - GGML_ASSERT(nbv0 == sizeof(float)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); - GGML_ASSERT(ned0 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); - GGML_ASSERT(ned1 == N); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (ith == 0) { - memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); - } - ggml_barrier(params->threadpool); - - const int64_t elem_q = ggml_nelements(q); - const int64_t elem_k = ggml_nelements(k); - - enum ggml_type result_type = dst->type; - GGML_ASSERT(ggml_blck_size(result_type) == 1); - const size_t tsize = ggml_type_size(result_type); - - const size_t offs_q = 0; - const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); - const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); - - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + offs_k; - void * grad_v = (char *) dst->data + offs_v; - - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; - - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; - - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; - - // parallelize by k rows using ggml_vec_dot_f32 - - // total rows in k - const int nr = nek2*nek3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const float scale = 1.0f/sqrtf(D); - - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); - - // how often k2 (and v2) is repeated in q2 - int nrep = neq2/nek2; - - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int ik3 = ir/(nek2); - const int ik2 = ir - ik3*nek2; - - const int iq3 = ik3; - const int id3 = ik3; - const int iv3 = ik3; - const int iv2 = ik2; - - for (int irep = 0; irep < nrep; ++irep) { - const int iq2 = ik2 + irep*nek2; - const int id2 = iq2; - - // (ik2 + irep*nek2) % nek2 == ik2 - for (int iq1 = 0; iq1 < neq1; ++iq1) { - const int id1 = iq1; - - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } - - const int64_t masked_begin = masked ? (P + iq1 + 1) : M; - for (int64_t ic = 0; ic < masked_begin; ++ic) { - // k indices - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f32(neq0, - S + i1, 0, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); - } - - // scale - ggml_vec_scale_f32(masked_begin, S, scale); - - for (int64_t i = masked_begin; i < M; i++) { - S[i] = -INFINITY; - } - - // softmax - // exclude known -INF S[..] values from max and loop - // dont forget to set their SM values to zero - { - float max = -INFINITY; - ggml_vec_max_f32(masked_begin, &max, S); - - ggml_float sum = 0.0; - { -#ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - ggml_vec_sum_f32(Mup, &sum, SM); -#else - sum = ggml_vec_soft_max_f32(Mup, SM, S, max); -#endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(masked_begin, SM, sum); - - } - - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for ik2,ik3: - // for irep: - // iq2 = ik2 + irep*nek2 - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf [D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i], S4) - // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,id1,id2,id3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,id1,id2,id3].T @ S4 - // - // in post-order: - // - // S1 = qcur @ kcur.T - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,id1,id2,id3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,id1,id2,id3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur @ kcur.T * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,ik2,ik3] += S.T @ qcur - // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM - } - - // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] - // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] - // for ic: - // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] - // exclude known future zero S[..] values from operation - ggml_vec_set_f32(masked_begin, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - ggml_vec_mad_f32(masked_begin, - S, - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); - } - - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); - ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (masked_begin, S, S, SM); - - // S = diag_mask_zero(S, P) * scale - // already done by above ggml_vec_set_f32 - - // exclude known zero S[..] values from operation - ggml_vec_scale_f32(masked_begin, S, scale); - - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // for ic: - // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] - // exclude known zero S[..] values from loop - for (int64_t ic = 0; ic < masked_begin; ++ic) { - ggml_vec_mad_f32(D, - (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), - (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), - S[ic]); - } - - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // for ic: - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - // exclude known zero S[..] values from loop - for (int64_t ic = 0; ic < masked_begin; ++ic) { - ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), - S[ic]); - } - - // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM - // for ic: - // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] - // exclude known zero SM[..] values from mad - for (int64_t ic = 0; ic < D; ++ic) { - ggml_vec_mad_f32(masked_begin, - (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); - } - } - } - } -} - -static void ggml_compute_forward_flash_attn_back( - const struct ggml_compute_params * params, - const bool masked, - struct ggml_tensor * dst) { - - const struct ggml_tensor * q = dst->src[0]; - - switch (q->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_flash_attn_back_f32(params, masked, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_ssm_conv - -static void ggml_compute_forward_ssm_conv_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; // conv_x - const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src1->ne[0]; // d_conv - const int ncs = src0->ne[0]; // d_conv - 1 + n_t - const int nr = src0->ne[1]; // d_inner - const int n_t = dst->ne[1]; // tokens per sequence - const int n_s = dst->ne[2]; // number of sequences in the batch - - GGML_ASSERT( dst->ne[0] == nr); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(src1->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - const int ir = ir1 - ir0; - - for (int i3 = 0; i3 < n_s; ++i3) { - for (int i2 = 0; i2 < n_t; ++i2) { - // {d_conv - 1 + n_t, d_inner, n_seqs} - // sliding window - const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} - const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner} - float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} - - // TODO: transpose the output for smaller strides for big batches? - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // rowwise dot product - // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision - float sumf = 0.0f; - - // d_conv - for (int i0 = 0; i0 < nc; ++i0) { - sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; - } - x[i1] = sumf; - } - } - } -} - -static void ggml_compute_forward_ssm_conv( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - switch (dst->src[0]->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_ssm_conv_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_ssm_scan - -static void ggml_compute_forward_ssm_scan_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; // s - const struct ggml_tensor * src1 = dst->src[1]; // x - const struct ggml_tensor * src2 = dst->src[2]; // dt - const struct ggml_tensor * src3 = dst->src[3]; // A - const struct ggml_tensor * src4 = dst->src[4]; // B - const struct ggml_tensor * src5 = dst->src[5]; // C - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nc = src0->ne[0]; // d_state - const int64_t nr = src0->ne[1]; // d_inner - const int64_t n_t = src1->ne[1]; // number of tokens per sequence - const int64_t n_s = src0->ne[2]; // number of sequences in the batch - - GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(src1->nb[0] == sizeof(float)); - GGML_ASSERT(src2->nb[0] == sizeof(float)); - GGML_ASSERT(src3->nb[0] == sizeof(float)); - GGML_ASSERT(src4->nb[0] == sizeof(float)); - GGML_ASSERT(src5->nb[0] == sizeof(float)); - // required for the dot product between s and C - GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - // required for per-sequence offsets for states - GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); - // required to get correct offset for state destination (i.e. src1->nb[3]) - GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - const int ir = ir1 - ir0; - - for (int i3 = 0; i3 < n_s; ++i3) { - for (int i2 = 0; i2 < n_t; ++i2) { - const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} - const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} - const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} - const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} - float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} - - // use the output as the source for the next token-wise iterations - if (i2 > 0) { s0 = s; } - - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 - float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; - float x_dt = x[i1] * dt_soft_plus; - float sumf = 0.0f; - // d_state - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - // state = prev_state * dA + dB * x - float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); - // y = rowwise_dotprod(state, C) - sumf += state * C[i0]; - s[i] = state; - } - y[i1] = sumf; - } - } - } -} - -static void ggml_compute_forward_ssm_scan( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - switch (dst->src[0]->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_ssm_scan_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_win_part - -static void ggml_compute_forward_win_part_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - UNUSED(params); - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - - const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; - const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; - const int32_t w = ((const int32_t *)(dst->op_params))[2]; - - assert(ne00 == ne0); - assert(ne3 == nep0*nep1); - - // TODO: optimize / multi-thread - for (int py = 0; py < nep1; ++py) { - for (int px = 0; px < nep0; ++px) { - const int64_t i3 = py*nep0 + px; - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = 0; i1 < ne1; ++i1) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i02 = py*w + i2; - const int64_t i01 = px*w + i1; - const int64_t i00 = i0; - - const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; - const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; - - if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { - ((float *) dst->data)[i] = 0.0f; - } else { - ((float *) dst->data)[i] = ((float *) src0->data)[j]; - } - } - } - } - } - } -} - -static void ggml_compute_forward_win_part( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_win_part_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_win_unpart - -static void ggml_compute_forward_win_unpart_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - UNUSED(params); - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - - const int32_t w = ((const int32_t *)(dst->op_params))[0]; - - // padding - const int px = (w - ne1%w)%w; - //const int py = (w - ne2%w)%w; - - const int npx = (px + ne1)/w; - //const int npy = (py + ne2)/w; - - assert(ne0 == ne00); - - // TODO: optimize / multi-thread - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = 0; i1 < ne1; ++i1) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int ip2 = i2/w; - const int ip1 = i1/w; - - const int64_t i02 = i2%w; - const int64_t i01 = i1%w; - const int64_t i00 = i0; - - const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; - const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; - - ((float *) dst->data)[j] = ((float *) src0->data)[i]; - } - } - } -} - -static void ggml_compute_forward_win_unpart( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_win_unpart_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -//gmml_compute_forward_unary - -static void ggml_compute_forward_unary( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const enum ggml_unary_op op = ggml_get_unary_op(dst); - - switch (op) { - case GGML_UNARY_OP_ABS: - { - ggml_compute_forward_abs(params, dst); - } break; - case GGML_UNARY_OP_SGN: - { - ggml_compute_forward_sgn(params, dst); - } break; - case GGML_UNARY_OP_NEG: - { - ggml_compute_forward_neg(params, dst); - } break; - case GGML_UNARY_OP_STEP: - { - ggml_compute_forward_step(params, dst); - } break; - case GGML_UNARY_OP_TANH: - { - ggml_compute_forward_tanh(params, dst); - } break; - case GGML_UNARY_OP_ELU: - { - ggml_compute_forward_elu(params, dst); - } break; - case GGML_UNARY_OP_RELU: - { - ggml_compute_forward_relu(params, dst); - } break; - case GGML_UNARY_OP_SIGMOID: - { - ggml_compute_forward_sigmoid(params, dst); - } break; - case GGML_UNARY_OP_GELU: - { - ggml_compute_forward_gelu(params, dst); - } break; - case GGML_UNARY_OP_GELU_QUICK: - { - ggml_compute_forward_gelu_quick(params, dst); - } break; - case GGML_UNARY_OP_SILU: - { - ggml_compute_forward_silu(params, dst); - } break; - case GGML_UNARY_OP_HARDSWISH: - { - ggml_compute_forward_hardswish(params, dst); - } break; - case GGML_UNARY_OP_HARDSIGMOID: - { - ggml_compute_forward_hardsigmoid(params, dst); - } break; - case GGML_UNARY_OP_EXP: - { - ggml_compute_forward_exp(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_get_rel_pos - -static void ggml_compute_forward_get_rel_pos_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - UNUSED(params); - - const struct ggml_tensor * src0 = dst->src[0]; - - // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - - GGML_TENSOR_UNARY_OP_LOCALS - - const int64_t w = ne1; - - ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; - ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = 0; i1 < ne1; ++i1) { - const int64_t pos = (w - i1 - 1) + i2; - for (int64_t i0 = 0; i0 < ne0; ++i0) { - dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; - } - } - } -} - -static void ggml_compute_forward_get_rel_pos( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - { - ggml_compute_forward_get_rel_pos_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_add_rel_pos - -static void ggml_compute_forward_add_rel_pos_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * src2 = dst->src[2]; - - const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; - if (!inplace) { - if (params->ith == 0) { - memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); - } - ggml_barrier(params->threadpool); - } - // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 - - float * src1_data = (float *) src1->data; - float * src2_data = (float *) src2->data; - float * dst_data = (float *) dst->data; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int ith = params->ith; - const int nth = params->nth; - - // total patches in dst - const int np = ne13; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - - for (int64_t i13 = ip0; i13 < ip1; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; - for (int64_t i10 = 0; i10 < ne10; ++i10) { - const int64_t jp0 = jp1 + i10; - const float src1_e = src1_data[jp0]; - const float src2_e = src2_data[jp0]; - - const int64_t jdh = jp0 * ne10; - const int64_t jdw = jdh - (ne10 - 1) * i10; - - for (int64_t j = 0; j < ne10; ++j) { - dst_data[jdh + j ] += src2_e; - dst_data[jdw + j*ne10] += src1_e; - } - } - } - } - } -} - -static void ggml_compute_forward_add_rel_pos( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_add_rel_pos_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_rwkv_wkv - -static void ggml_compute_forward_rwkv_wkv_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - const size_t T = dst->src[1]->ne[3]; - const size_t C = dst->ne[0]; - const size_t H = dst->src[1]->ne[2]; - const size_t n_seqs = dst->src[5]->ne[1]; - - float * dst_data = (float *) dst->data; - float * state = ((float *) dst->data) + C * T; - - if (params->ith != 0) { - return; - } - - memset(dst_data, 0, T * C * sizeof(float)); - - float * k = (float *) dst->src[0]->data; - float * v = (float *) dst->src[1]->data; - float * r = (float *) dst->src[2]->data; - float * time_faaaa = (float *) dst->src[3]->data; - float * time_decay = (float *) dst->src[4]->data; - - size_t t_stride = H * (C / H); - - size_t h_stride = C / H; - size_t h_stride_2d = (C / H) * (C / H); - - // basically fused operations: - // dst = r @ (time_faaaa * (k @ v) + state), - // state = time_decay * state + (k @ v), - // recursive through each token - for (size_t t = 0; t < T; t++) { - size_t t_offset = t * t_stride; - size_t state_offset = (C / H) * C * (t / (T / n_seqs)); - float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; - - for (size_t h = 0; h < H; h++) { - size_t h_offset = h * h_stride; - size_t t_h_offset = t_offset + h_offset; - size_t h_2d_offset = h * h_stride_2d; - - for (size_t i = 0; i < C / H; i++) { - size_t t_h_i_offset = t_h_offset + i; - size_t h_i_offset = h_offset + i; - size_t h_2d_i_offset = h_2d_offset + i * h_stride; - - float k_val = k[t_h_i_offset]; - float r_val = r[t_h_i_offset]; - float time_faaaa_val = time_faaaa[h_i_offset]; - // RWKV v6: different time_decay for each token. - float time_decay_val = time_decay[t_h_i_offset]; - - for (size_t j = 0; j < C / H; j ++) { - size_t t_h_j_offset = t_h_offset + j; - size_t h_2d_i_j_offset = h_2d_i_offset + j; - - float v_val = v[t_h_j_offset]; - float kv_val = v_val * k_val; - float prev_state_val = state_prev[h_2d_i_j_offset]; - float temp_val = kv_val * time_faaaa_val + prev_state_val; - dst_data[t_h_j_offset] += temp_val * r_val; - state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; - } - } - } - } -} - -static void ggml_compute_forward_rwkv_wkv( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_rwkv_wkv_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_map_unary - -static void ggml_compute_forward_map_unary_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_map_unary( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_unary_f32(params, dst, fun); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_map_binary - -static void ggml_compute_forward_map_binary_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(src1)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); - } -} - -static void ggml_compute_forward_map_binary( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_binary_f32(params, dst, fun); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_map_custom1 - -static void ggml_compute_forward_map_custom1_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_custom1_op_f32_t fun) { - - const struct ggml_tensor * a = dst->src[0]; - - if (params->ith != 0) { - return; - } - - fun(dst, a); -} - -// ggml_compute_forward_map_custom2 - -static void ggml_compute_forward_map_custom2_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_custom2_op_f32_t fun) { - - const struct ggml_tensor * a = dst->src[0]; - const struct ggml_tensor * b = dst->src[1]; - - if (params->ith != 0) { - return; - } - - fun(dst, a, b); -} - -// ggml_compute_forward_map_custom3 - -static void ggml_compute_forward_map_custom3_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst, - const ggml_custom3_op_f32_t fun) { - - const struct ggml_tensor * a = dst->src[0]; - const struct ggml_tensor * b = dst->src[1]; - const struct ggml_tensor * c = dst->src[1]; - - if (params->ith != 0) { - return; - } - - fun(dst, a, b, c); -} - -// ggml_compute_forward_map_custom1 - -static void ggml_compute_forward_map_custom1( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * a = dst->src[0]; - - struct ggml_map_custom1_op_params p; - memcpy(&p, dst->op_params, sizeof(p)); - - p.fun(dst, a, params->ith, params->nth, p.userdata); -} - -// ggml_compute_forward_map_custom2 - -static void ggml_compute_forward_map_custom2( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * a = dst->src[0]; - const struct ggml_tensor * b = dst->src[1]; - - struct ggml_map_custom2_op_params p; - memcpy(&p, dst->op_params, sizeof(p)); - - p.fun(dst, a, b, params->ith, params->nth, p.userdata); -} - -// ggml_compute_forward_map_custom3 - -static void ggml_compute_forward_map_custom3( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * a = dst->src[0]; - const struct ggml_tensor * b = dst->src[1]; - const struct ggml_tensor * c = dst->src[2]; - - struct ggml_map_custom3_op_params p; - memcpy(&p, dst->op_params, sizeof(p)); - - p.fun(dst, a, b, c, params->ith, params->nth, p.userdata); -} - -// ggml_compute_forward_cross_entropy_loss - -static void ggml_compute_forward_cross_entropy_loss_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(ggml_are_same_shape(src0, src1)); - GGML_ASSERT(ggml_is_scalar(dst)); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - // TODO: handle transposed/permuted matrices - const int64_t nc = src0->ne[0]; - const int64_t nr = ggml_nrows(src0); - - const int ith = params->ith; - const int nth = params->nth; - - float * sums = (float *) params->wdata; - float * st = ((float *) params->wdata) + nth + ith*nc; - float sum_thread = 0.0f; - - GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - for (int64_t i1 = ir0; i1 < ir1; ++i1) { - const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); - const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); - -#ifndef NDEBUG - for (int64_t i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(s0[i])); - assert(!isnan(s1[i])); - } -#endif - - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, s0); - const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max); - assert(sum_softmax >= 0.0); - - ggml_vec_add1_f32(nc, st, st, -sum_softmax); - ggml_vec_mul_f32(nc, st, st, s1); - - float sum_st = 0.0f; - ggml_vec_sum_f32(nc, &sum_st, st); - sum_thread += sum_st; - -#ifndef NDEBUG - for (int64_t i = 0; i < nc; ++i) { - assert(!isnan(st[i])); - assert(!isinf(st[i])); - } -#endif - } - sums[ith] = sum_thread; - ggml_barrier(params->threadpool); - - if (ith == 0) { - float * dp = (float *) dst->data; - ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f / (float) nr; - } -} - -static void ggml_compute_forward_cross_entropy_loss( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_cross_entropy_loss_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_cross_entropy_loss_back - -static void ggml_compute_forward_cross_entropy_loss_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * opt0 = dst->src[2]; - - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(opt0)); - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int64_t ith = params->ith; - const int64_t nth = params->nth; - - // TODO: handle transposed/permuted matrices - const int64_t nc = src0->ne[0]; - const int64_t nr = ggml_nrows(src0); - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr; - - for (int64_t i1 = ir0; i1 < ir1; i1++) { - float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); - float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); - float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - -#ifndef NDEBUG - for (int64_t i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(s0[i])); - assert(!isnan(s1[i])); - } -#endif - - // soft_max - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, s0); - ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); - assert(sum > 0.0); - ggml_vec_scale_f32(nc, ds0, 1.0/sum); - - // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr - ggml_vec_sub_f32(nc, ds0, ds0, s1); - ggml_vec_scale_f32(nc, ds0, d_by_nr); - -#ifndef NDEBUG - for (int64_t i = 0; i < nc; ++i) { - assert(!isnan(ds0[i])); - assert(!isinf(ds0[i])); - } -#endif - } -} - -static void ggml_compute_forward_cross_entropy_loss_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_opt_step_adamw_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src0_grad = dst->src[1]; - const struct ggml_tensor * src0_grad_m = dst->src[2]; - const struct ggml_tensor * src0_grad_v = dst->src[3]; - GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_UNARY_OP_LOCALS - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - /* const float gnorm = 1.0f; */ - int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t)); - const float alpha = ggml_get_op_params_f32(dst, 2); - const float beta1 = ggml_get_op_params_f32(dst, 3); - const float beta2 = ggml_get_op_params_f32(dst, 4); - const float eps = ggml_get_op_params_f32(dst, 5); - const float wd = ggml_get_op_params_f32(dst, 6); - - const float beta1h = alpha/(1.0f - powf(beta1, iter)); - const float beta2h = 1.0f/(1.0f - powf(beta2, iter)); - - for (int ir = ir0; ir < ir1; ++ir) { - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const size_t offset = i03*nb03 + i02*nb02 + i01*nb01; - - float * w = (float *) ((char *) src0->data + offset); // weight - const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad - float * m = (float *) ((char *) src0_grad_m->data + offset); - float * v = (float *) ((char *) src0_grad_v->data + offset); - - for (int i00 = 0; i00 < ne00; ++i00) { - m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); - v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); - - const float mh = m[i00]*beta1h; - const float vh = sqrtf(v[i00]*beta2h) + eps; - - // The weight decay is applied independently of the Adam momenta m and v. - // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss. - // See: https://arxiv.org/pdf/1711.05101v3.pdf - w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh; - } - } - - ggml_barrier(params->threadpool); - if (ith != 0) { - return; - } - - iter++; - memcpy(&dst->op_params[0], &iter, sizeof(int64_t)); -} - -static void ggml_compute_forward_opt_step_adamw( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_opt_step_adamw_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} -///////////////////////////////// - -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - GGML_ASSERT(params); - - if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { - return; - } - - switch (tensor->op) { - case GGML_OP_DUP: - { - ggml_compute_forward_dup(params, tensor); - } break; - case GGML_OP_ADD: - { - ggml_compute_forward_add(params, tensor); - } break; - case GGML_OP_ADD1: - { - ggml_compute_forward_add1(params, tensor); - } break; - case GGML_OP_ACC: - { - ggml_compute_forward_acc(params, tensor); - } break; - case GGML_OP_SUB: - { - ggml_compute_forward_sub(params, tensor); - } break; - case GGML_OP_MUL: - { - ggml_compute_forward_mul(params, tensor); - } break; - case GGML_OP_DIV: - { - ggml_compute_forward_div(params, tensor); - } break; - case GGML_OP_SQR: - { - ggml_compute_forward_sqr(params, tensor); - } break; - case GGML_OP_SQRT: - { - ggml_compute_forward_sqrt(params, tensor); - } break; - case GGML_OP_LOG: - { - ggml_compute_forward_log(params, tensor); - } break; - case GGML_OP_SIN: - { - ggml_compute_forward_sin(params, tensor); - } break; - case GGML_OP_COS: - { - ggml_compute_forward_cos(params, tensor); - } break; - case GGML_OP_SUM: - { - ggml_compute_forward_sum(params, tensor); - } break; - case GGML_OP_SUM_ROWS: - { - ggml_compute_forward_sum_rows(params, tensor); - } break; - case GGML_OP_MEAN: - { - ggml_compute_forward_mean(params, tensor); - } break; - case GGML_OP_ARGMAX: - { - ggml_compute_forward_argmax(params, tensor); - } break; - case GGML_OP_COUNT_EQUAL: - { - ggml_compute_forward_count_equal(params, tensor); - } break; - case GGML_OP_REPEAT: - { - ggml_compute_forward_repeat(params, tensor); - } break; - case GGML_OP_REPEAT_BACK: - { - ggml_compute_forward_repeat_back(params, tensor); - } break; - case GGML_OP_CONCAT: - { - ggml_compute_forward_concat(params, tensor); - } break; - case GGML_OP_SILU_BACK: - { - ggml_compute_forward_silu_back(params, tensor); - } break; - case GGML_OP_NORM: - { - ggml_compute_forward_norm(params, tensor); - } break; - case GGML_OP_RMS_NORM: - { - ggml_compute_forward_rms_norm(params, tensor); - } break; - case GGML_OP_RMS_NORM_BACK: - { - ggml_compute_forward_rms_norm_back(params, tensor); - } break; - case GGML_OP_GROUP_NORM: - { - ggml_compute_forward_group_norm(params, tensor); - } break; - case GGML_OP_MUL_MAT: - { - ggml_compute_forward_mul_mat(params, tensor); - } break; - case GGML_OP_MUL_MAT_ID: - { - ggml_compute_forward_mul_mat_id(params, tensor); - } break; - case GGML_OP_OUT_PROD: - { - ggml_compute_forward_out_prod(params, tensor); - } break; - case GGML_OP_SCALE: - { - ggml_compute_forward_scale(params, tensor); - } break; - case GGML_OP_SET: - { - ggml_compute_forward_set(params, tensor); - } break; - case GGML_OP_CPY: - { - ggml_compute_forward_cpy(params, tensor); - } break; - case GGML_OP_CONT: - { - ggml_compute_forward_cont(params, tensor); - } break; - case GGML_OP_RESHAPE: - { - ggml_compute_forward_reshape(params, tensor); - } break; - case GGML_OP_VIEW: - { - ggml_compute_forward_view(params, tensor); - } break; - case GGML_OP_PERMUTE: - { - ggml_compute_forward_permute(params, tensor); - } break; - case GGML_OP_TRANSPOSE: - { - ggml_compute_forward_transpose(params, tensor); - } break; - case GGML_OP_GET_ROWS: - { - ggml_compute_forward_get_rows(params, tensor); - } break; - case GGML_OP_GET_ROWS_BACK: - { - ggml_compute_forward_get_rows_back(params, tensor); - } break; - case GGML_OP_DIAG: - { - ggml_compute_forward_diag(params, tensor); - } break; - case GGML_OP_DIAG_MASK_INF: - { - ggml_compute_forward_diag_mask_inf(params, tensor); - } break; - case GGML_OP_DIAG_MASK_ZERO: - { - ggml_compute_forward_diag_mask_zero(params, tensor); - } break; - case GGML_OP_SOFT_MAX: - { - ggml_compute_forward_soft_max(params, tensor); - } break; - case GGML_OP_SOFT_MAX_BACK: - { - ggml_compute_forward_soft_max_back(params, tensor); - } break; - case GGML_OP_ROPE: - { - ggml_compute_forward_rope(params, tensor); - } break; - case GGML_OP_ROPE_BACK: - { - ggml_compute_forward_rope_back(params, tensor); - } break; - case GGML_OP_CLAMP: - { - ggml_compute_forward_clamp(params, tensor); - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - ggml_compute_forward_conv_transpose_1d(params, tensor); - } break; - case GGML_OP_IM2COL: - { - ggml_compute_forward_im2col(params, tensor); - } break; - case GGML_OP_IM2COL_BACK: - { - ggml_compute_forward_im2col_back_f32(params, tensor); - } break; - case GGML_OP_CONV_TRANSPOSE_2D: - { - ggml_compute_forward_conv_transpose_2d(params, tensor); - } break; - case GGML_OP_POOL_1D: - { - ggml_compute_forward_pool_1d(params, tensor); - } break; - case GGML_OP_POOL_2D: - { - ggml_compute_forward_pool_2d(params, tensor); - } break; - case GGML_OP_POOL_2D_BACK: - { - ggml_compute_forward_pool_2d_back(params, tensor); - } break; - case GGML_OP_UPSCALE: - { - ggml_compute_forward_upscale(params, tensor); - } break; - case GGML_OP_PAD: - { - ggml_compute_forward_pad(params, tensor); - } break; - case GGML_OP_ARANGE: - { - ggml_compute_forward_arange(params, tensor); - } break; - case GGML_OP_TIMESTEP_EMBEDDING: - { - ggml_compute_forward_timestep_embedding(params, tensor); - } break; - case GGML_OP_ARGSORT: - { - ggml_compute_forward_argsort(params, tensor); - } break; - case GGML_OP_LEAKY_RELU: - { - ggml_compute_forward_leaky_relu(params, tensor); - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - int32_t t = ggml_get_op_params_i32(tensor, 0); - GGML_ASSERT(t == 0 || t == 1); - bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, masked, tensor); - } break; - case GGML_OP_SSM_CONV: - { - ggml_compute_forward_ssm_conv(params, tensor); - } break; - case GGML_OP_SSM_SCAN: - { - ggml_compute_forward_ssm_scan(params, tensor); - } break; - case GGML_OP_WIN_PART: - { - ggml_compute_forward_win_part(params, tensor); - } break; - case GGML_OP_WIN_UNPART: - { - ggml_compute_forward_win_unpart(params, tensor); - } break; - case GGML_OP_UNARY: - { - ggml_compute_forward_unary(params, tensor); - } break; - case GGML_OP_GET_REL_POS: - { - ggml_compute_forward_get_rel_pos(params, tensor); - } break; - case GGML_OP_ADD_REL_POS: - { - ggml_compute_forward_add_rel_pos(params, tensor); - } break; - case GGML_OP_RWKV_WKV: - { - ggml_compute_forward_rwkv_wkv(params, tensor); - } break; - case GGML_OP_MAP_UNARY: - { - ggml_unary_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_unary(params, tensor, fun); - } - break; - case GGML_OP_MAP_BINARY: - { - ggml_binary_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_binary(params, tensor, fun); - } - break; - case GGML_OP_MAP_CUSTOM1_F32: - { - ggml_custom1_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1_f32(params, tensor, fun); - } - break; - case GGML_OP_MAP_CUSTOM2_F32: - { - ggml_custom2_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2_f32(params, tensor, fun); - } - break; - case GGML_OP_MAP_CUSTOM3_F32: - { - ggml_custom3_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3_f32(params, tensor, fun); - } - break; - case GGML_OP_MAP_CUSTOM1: - { - ggml_compute_forward_map_custom1(params, tensor); - } - break; - case GGML_OP_MAP_CUSTOM2: - { - ggml_compute_forward_map_custom2(params, tensor); - } - break; - case GGML_OP_MAP_CUSTOM3: - { - ggml_compute_forward_map_custom3(params, tensor); - } - break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - ggml_compute_forward_cross_entropy_loss(params, tensor); - } - break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - ggml_compute_forward_cross_entropy_loss_back(params, tensor); - } - break; - case GGML_OP_OPT_STEP_ADAMW: - { - ggml_compute_forward_opt_step_adamw(params, tensor); - } - break; - case GGML_OP_NONE: - { - // nop - } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); - } - } -} - -//////////////////////////////////////////////////////////////////////////////// - struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; @@ -18870,7 +6320,6 @@ void ggml_build_opt_adamw( } } - static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { void * ptr = *p; ptr = (void *) GGML_PAD((uintptr_t) ptr, align); @@ -18994,6 +6443,19 @@ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgrap return result; } +struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + if (ggml_is_empty(tensor)) { + return tensor; + } + if (tensor->buffer) { + ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); + } else { + GGML_ASSERT(tensor->data); + memset(tensor->data, 0, ggml_nbytes(tensor)); + } + return tensor; +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { GGML_ASSERT(cgraph->grads != NULL); @@ -19058,1096 +6520,6 @@ void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tenso cgraph->n_nodes++; } -// Android's libc implementation "bionic" does not support setting affinity -#if defined(__gnu_linux__) -static void set_numa_thread_affinity(int thread_n) { - if (!ggml_is_numa()) { - return; - } - - int node_num; - int rv; - size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); - - switch(g_state.numa.numa_strategy) { - case GGML_NUMA_STRATEGY_DISTRIBUTE: - // run thread on node_num thread_n / (threads per node) - node_num = thread_n % g_state.numa.n_nodes; - break; - case GGML_NUMA_STRATEGY_ISOLATE: - // run thread on current_node - node_num = g_state.numa.current_node; - break; - case GGML_NUMA_STRATEGY_NUMACTL: - // use the cpuset that numactl gave us - rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); - } - return; - default: - return; - } - - struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; - - cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); - CPU_ZERO_S(setsize, cpus); - for (size_t i = 0; i < node->n_cpus; ++i) { - CPU_SET_S(node->cpus[i], setsize, cpus); - } - - rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); - } - - CPU_FREE(cpus); -} - -static void clear_numa_thread_affinity(void) { - if (!ggml_is_numa()) { - return; - } - - size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); - - cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); - CPU_ZERO_S(setsize, cpus); - for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { - CPU_SET_S(i, setsize, cpus); - } - - int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); - } - - CPU_FREE(cpus); -} -#else -// TODO: Windows etc. -// (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } -static void clear_numa_thread_affinity(void) {} -#endif - -static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - int n_tasks = 0; - - if (ggml_is_empty(node)) { - // no need to multi-thread a no-op - n_tasks = 1; - return n_tasks; - } - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - case GGML_OP_CONT: - case GGML_OP_ADD: - case GGML_OP_ADD1: - case GGML_OP_ACC: - { - n_tasks = n_threads; - } break; - case GGML_OP_SUB: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - { - n_tasks = 1; - } break; - case GGML_OP_COUNT_EQUAL: - { - n_tasks = n_threads; - } break; - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_LEAKY_RELU: - { - n_tasks = 1; - } break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(node)) { - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SGN: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_EXP: - { - n_tasks = 1; - } break; - - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_SILU: - { - n_tasks = n_threads; - } break; - default: - GGML_ABORT("fatal error"); - } - break; - case GGML_OP_SILU_BACK: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - case GGML_OP_GROUP_NORM: - case GGML_OP_CONCAT: - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: - case GGML_OP_OUT_PROD: - { - n_tasks = n_threads; - } break; - case GGML_OP_GET_ROWS: - { - // FIXME: get_rows can use additional threads, but the cost of launching additional threads - // decreases performance with GPU offloading - //n_tasks = n_threads; - n_tasks = 1; - } break; - case GGML_OP_SCALE: - case GGML_OP_SET: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - { - n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_ZERO: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - case GGML_OP_ADD_REL_POS: - { - n_tasks = n_threads; - } break; - case GGML_OP_CLAMP: - { - n_tasks = 1; //TODO - } break; - case GGML_OP_SOFT_MAX: - { - n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); - } break; - case GGML_OP_IM2COL: - case GGML_OP_IM2COL_BACK: - case GGML_OP_CONV_TRANSPOSE_1D: - case GGML_OP_CONV_TRANSPOSE_2D: - { - n_tasks = n_threads; - } break; - case GGML_OP_POOL_1D: - case GGML_OP_POOL_2D: - case GGML_OP_POOL_2D_BACK: - { - n_tasks = 1; - } break; - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: - case GGML_OP_FLASH_ATTN_EXT: - case GGML_OP_FLASH_ATTN_BACK: - case GGML_OP_SSM_CONV: - case GGML_OP_SSM_SCAN: - { - n_tasks = n_threads; - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_GET_REL_POS: - case GGML_OP_RWKV_WKV: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1_F32: - case GGML_OP_MAP_CUSTOM2_F32: - case GGML_OP_MAP_CUSTOM3_F32: - { - n_tasks = 1; - } break; - case GGML_OP_MAP_CUSTOM1: - { - struct ggml_map_custom1_op_params p; - memcpy(&p, node->op_params, sizeof(p)); - if (p.n_tasks == GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p.n_tasks, n_threads); - } - } break; - case GGML_OP_MAP_CUSTOM2: - { - struct ggml_map_custom2_op_params p; - memcpy(&p, node->op_params, sizeof(p)); - if (p.n_tasks == GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p.n_tasks, n_threads); - } - } break; - case GGML_OP_MAP_CUSTOM3: - { - struct ggml_map_custom3_op_params p; - memcpy(&p, node->op_params, sizeof(p)); - if (p.n_tasks == GGML_N_TASKS_MAX) { - n_tasks = n_threads; - } else { - n_tasks = MIN(p.n_tasks, n_threads); - } - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - case GGML_OP_OPT_STEP_ADAMW: - { - n_tasks = n_threads; - } break; - case GGML_OP_NONE: - { - n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); - } - default: - { - fprintf(stderr, "%s: op not implemented: ", __func__); - if (node->op < GGML_OP_COUNT) { - fprintf(stderr, "%s\n", ggml_op_name(node->op)); - } else { - fprintf(stderr, "%d\n", node->op); - } - GGML_ABORT("fatal error"); - } - } - - assert(n_tasks > 0); - - return n_tasks; -} - -static thread_ret_t ggml_graph_compute_secondary_thread(void* data); - -#if defined(_WIN32) -#include "windows.h" - -// TODO: support > 64 CPUs -bool ggml_thread_apply_affinity(bool * mask) { - HANDLE h = GetCurrentThread(); - uint64_t bitmask = 0ULL; - - assert(GGML_MAX_N_THREADS >= 64); - - for (int32_t i = 0; i < 8; i++) { - int32_t idx = i * 8; - uint8_t val = 0; - val |= mask[idx + 0] << 0; - val |= mask[idx + 1] << 1; - val |= mask[idx + 2] << 2; - val |= mask[idx + 3] << 3; - val |= mask[idx + 4] << 4; - val |= mask[idx + 5] << 5; - val |= mask[idx + 6] << 6; - val |= mask[idx + 7] << 7; - bitmask |= (uint64_t)val << idx; - } - - for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { - if (mask[i]) { - fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); - break; - } - } - - DWORD_PTR m = (DWORD_PTR)bitmask; - - m = SetThreadAffinityMask(h, m); - - return m != 0; -} - -static bool ggml_thread_apply_priority(int32_t prio) { - // Note that on Windows the Process Priority Class must be updated in order to set Thread priority. - // This is up to the applications. - DWORD p = THREAD_PRIORITY_NORMAL; - switch (prio) { - case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; - case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; - case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; - case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; - } - - if (prio == GGML_SCHED_PRIO_NORMAL) { - // Keep inherited policy/priority - return true; - } - - if (!SetThreadPriority(GetCurrentThread(), p)) { - fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); - return false; - } - - return true; -} - -#elif defined(__APPLE__) -#include -#include - -static bool ggml_thread_apply_affinity(const bool * mask) { - // Not supported on Apple platforms - UNUSED(mask); - return true; -} - -static bool ggml_thread_apply_priority(int32_t prio) { - struct sched_param p; - int32_t policy = SCHED_OTHER; - switch (prio) { - case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; - } - - if (prio == GGML_SCHED_PRIO_NORMAL) { - // Keep inherited policy/priority - return true; - } - - int32_t err = pthread_setschedparam(pthread_self(), policy, &p); - if (err != 0) { - fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); - return false; - } - - return true; -} - -#elif defined(__gnu_linux__) -// TODO: this may not work on BSD, to be verified - -static bool ggml_thread_apply_affinity(const bool * mask) { - cpu_set_t cpuset; - int err; - - CPU_ZERO(&cpuset); - - for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { - if (mask[i]) { - GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); - CPU_SET(i, &cpuset); - } - } - -#ifdef __ANDROID__ - err = sched_setaffinity(0, sizeof(cpuset), &cpuset); - if (err < 0) { - err = errno; - } -#else - err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); -#endif - if (err != 0) { - fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err); - return false; - } - - return true; -} - -static bool ggml_thread_apply_priority(int32_t prio) { - struct sched_param p; - int32_t policy = SCHED_OTHER; - switch (prio) { - case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; - } - - if (prio == GGML_SCHED_PRIO_NORMAL) { - // Keep inherited policy/priority - return true; - } - - int32_t err = pthread_setschedparam(pthread_self(), policy, &p); - if (err != 0) { - fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err); - return false; - } - - return true; -} - -#else // unsupported platforms - -static bool ggml_thread_apply_affinity(const bool * mask) { - UNUSED(mask); - return true; -} - -static bool ggml_thread_apply_priority(int32_t prio) { - UNUSED(prio); - return true; -} - -#endif - -static bool ggml_thread_cpumask_is_valid(const bool * mask) { - for (int i = 0; i < GGML_MAX_N_THREADS; i++) { - if (mask[i]) { return true; } - } - return false; -} - -static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { - if (!strict) { - memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); - return; - } else { - memset(local_mask, 0, GGML_MAX_N_THREADS); - int32_t base_idx = *iter; - for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { - int32_t idx = base_idx + i; - if (idx >= GGML_MAX_N_THREADS) { - // Just a cheaper modulo - idx -= GGML_MAX_N_THREADS; - } - if (global_mask[idx]) { - local_mask[idx] = 1; - *iter = idx + 1; - return; - } - } - } -} - -void ggml_threadpool_free(struct ggml_threadpool* threadpool) { - if (!threadpool) return; - - const int n_threads = threadpool->n_threads_max; - -#ifndef GGML_USE_OPENMP - struct ggml_compute_state* workers = threadpool->workers; - - ggml_mutex_lock(&threadpool->mutex); - - threadpool->stop = true; - threadpool->pause = false; - - ggml_cond_broadcast(&threadpool->cond); - ggml_mutex_unlock(&threadpool->mutex); - - for (int j = 1; j < n_threads; j++) { - int32_t rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); - UNUSED(rc); - } - - ggml_mutex_destroy(&threadpool->mutex); - ggml_cond_destroy(&threadpool->cond); -#endif // GGML_USE_OPENMP - - const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads; - ggml_aligned_free(threadpool->workers, workers_size); - ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool)); -} - -#ifndef GGML_USE_OPENMP -// pause/resume must be called under mutex -static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) { - GGML_PRINT_DEBUG("Pausing threadpool\n"); - threadpool->pause = true; - ggml_cond_broadcast(&threadpool->cond); -} - -static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) { - GGML_PRINT_DEBUG("Resuming threadpool\n"); - threadpool->pause = false; - ggml_cond_broadcast(&threadpool->cond); -} -#endif - -void ggml_threadpool_pause(struct ggml_threadpool * threadpool) { -#ifndef GGML_USE_OPENMP - ggml_mutex_lock(&threadpool->mutex); - if (!threadpool->pause) { - ggml_threadpool_pause_locked(threadpool); - } - ggml_mutex_unlock(&threadpool->mutex); -#else - UNUSED(threadpool); -#endif -} - -void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { -#ifndef GGML_USE_OPENMP - ggml_mutex_lock(&threadpool->mutex); - if (threadpool->pause) { - ggml_threadpool_resume_locked(threadpool); - } - ggml_mutex_unlock(&threadpool->mutex); -#else - UNUSED(threadpool); -#endif -} - -struct ggml_cplan ggml_graph_plan( - const struct ggml_cgraph * cgraph, - int n_threads, - struct ggml_threadpool * threadpool) { - - if (threadpool == NULL) { - GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); - } - if (n_threads <= 0) { - n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; - } - - size_t work_size = 0; - - struct ggml_cplan cplan; - memset(&cplan, 0, sizeof(struct ggml_cplan)); - - int max_tasks = 1; - - // thread scheduling for the different operations + work buffer size estimation - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - const int n_tasks = ggml_get_n_tasks(node, n_threads); - - max_tasks = MAX(max_tasks, n_tasks); - - size_t cur = 0; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - if (ggml_is_quantized(node->type) || - // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 - (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || - (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_ACC: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; - } - } break; - case GGML_OP_COUNT_EQUAL: - { - cur = ggml_type_size(node->type)*n_tasks; - } break; - case GGML_OP_MUL_MAT: - { - const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; - - if (node->src[1]->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); - } - } break; - case GGML_OP_MUL_MAT_ID: - { - cur = 0; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; - if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); - } - const int n_as = src0->ne[2]; - cur += GGML_PAD(cur, sizeof(int64_t)); // align - cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows - } break; - case GGML_OP_OUT_PROD: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - { - cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); - - const int64_t ne00 = node->src[0]->ne[0]; // K - const int64_t ne01 = node->src[0]->ne[1]; // Cout - const int64_t ne02 = node->src[0]->ne[2]; // Cin - - const int64_t ne10 = node->src[1]->ne[0]; // L - const int64_t ne11 = node->src[1]->ne[1]; // Cin - - if ((node->src[0]->type == GGML_TYPE_F16 || - node->src[0]->type == GGML_TYPE_BF16) && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; - cur += sizeof(ggml_fp16_t)*ne10*ne11; - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(float)*ne00*ne01*ne02; - cur += sizeof(float)*ne10*ne11; - } else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_OP_CONV_TRANSPOSE_2D: - { - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // Channels Out - const int64_t ne03 = node->src[0]->ne[3]; // Channels In - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // Channels In - - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; - cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - const int64_t ne00 = node->src[0]->ne[0]; // D - - cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - const int64_t D = node->src[0]->ne[0]; - const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_BF16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } - } break; - - case GGML_OP_CROSS_ENTROPY_LOSS: - { - cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); - } - default: - break; - } - - work_size = MAX(work_size, cur); - } - - if (work_size > 0) { - work_size += CACHE_LINE_SIZE*(n_threads); - } - - cplan.threadpool = threadpool; - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; - - return cplan; -} - -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_threadpool * tp = state->threadpool; - - const struct ggml_cgraph * cgraph = tp->cgraph; - const struct ggml_cplan * cplan = tp->cplan; - - set_numa_thread_affinity(state->ith); - - struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ tp, - }; - - for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; - - ggml_compute_forward(¶ms, node); - - if (state->ith == 0 && cplan->abort_callback && - cplan->abort_callback(cplan->abort_callback_data)) { - tp->abort = true; - tp->ec = GGML_STATUS_ABORTED; - } - - ggml_barrier(state->threadpool); - } - - return 0; -} - -#ifndef GGML_USE_OPENMP - -// check if thread is active -static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) { - struct ggml_threadpool * threadpool = state->threadpool; - int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed); - return (state->ith < n_threads); -} - -// check if thread is ready to proceed (exit from polling or sleeping) -static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) { - struct ggml_threadpool * threadpool = state->threadpool; - - if (state->pending || threadpool->stop || threadpool->pause) { return true; } - - // check for new graph/work - int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); - if (new_graph != state->last_graph) { - state->pending = ggml_graph_compute_thread_active(state); - state->last_graph = new_graph; - } - - return state->pending; -} - -// sync thread state after polling -static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) { - // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead - #ifdef GGML_TSAN_ENABLED - atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst); - #else - atomic_thread_fence(memory_order_seq_cst); - #endif - UNUSED(state); -} - -static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { - struct ggml_threadpool * threadpool = state->threadpool; - - // Skip polling for unused threads - if (!ggml_graph_compute_thread_active(state)) { - return state->pending; - } - - // This seems to make 0 ... 100 a decent range for polling level across modern processors. - // Perhaps, we can adjust it dynamically based on load and things. - const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; - - for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) { - // No new work. Keep polling. - ggml_thread_cpu_relax(); - } - - return state->pending; -} - -static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { - struct ggml_threadpool * threadpool = state->threadpool; - - if (ggml_graph_compute_poll_for_work(state)) { - ggml_graph_compute_thread_sync(state); - return state->pending; - } - - ggml_mutex_lock_shared(&threadpool->mutex); - while (!ggml_graph_compute_thread_ready(state)) { - // No new work. Wait for the signal. - GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith); - ggml_cond_wait(&threadpool->cond, &threadpool->mutex); - } - ggml_mutex_unlock_shared(&threadpool->mutex); - - return state->pending; -} - -static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_threadpool * threadpool = state->threadpool; - - ggml_thread_apply_priority(threadpool->prio); - if (ggml_thread_cpumask_is_valid(state->cpumask)) { - ggml_thread_apply_affinity(state->cpumask); - } - - while (true) { - // Check if we need to sleep - while (threadpool->pause) { - GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); - ggml_mutex_lock_shared(&threadpool->mutex); - if (threadpool->pause) { - ggml_cond_wait(&threadpool->cond, &threadpool->mutex); - } - GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); - ggml_mutex_unlock_shared(&threadpool->mutex); - } - - // This needs to be checked for after the cond_wait - if (threadpool->stop) break; - - // Check if there is new work - // The main thread is the only one that can dispatch new work - - ggml_graph_compute_check_for_work(state); - if (state->pending) { - state->pending = false; - - ggml_graph_compute_thread(state); - } - } - - return (thread_ret_t) 0; -} - -// Start processing new graph -static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads) -{ - // Always take the mutex here because the worker threads are doing hybrid poll/wait - - ggml_mutex_lock(&threadpool->mutex); - - GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads); - - // Update the number of active threads - atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); - - // Indicate the graph is ready to be processed - // We need the full seq-cst fence here because of the polling threads (used in thread_sync) - atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst); - - if (threadpool->pause) { - // Update main thread prio and affinity to match the threadpool settings - ggml_thread_apply_priority(threadpool->prio); - if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { - ggml_thread_apply_affinity(threadpool->workers[0].cpumask); - } - - // resume does cond broadcast - ggml_threadpool_resume_locked(threadpool); - } else { - ggml_cond_broadcast(&threadpool->cond); - } - - ggml_mutex_unlock(&threadpool->mutex); -} - -#endif // GGML_USE_OPENMP - -void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - -static struct ggml_threadpool * ggml_threadpool_new_impl( - struct ggml_threadpool_params * tpp, - struct ggml_cgraph * cgraph, - struct ggml_cplan * cplan) { - - struct ggml_threadpool * threadpool = - ggml_aligned_malloc(sizeof(struct ggml_threadpool)); - { - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_graph = 0; - threadpool->n_barrier = 0; - threadpool->n_barrier_passed = 0; - threadpool->current_chunk = 0; - threadpool->stop = false; - threadpool->pause = tpp->paused; - threadpool->abort = false; - threadpool->workers = NULL; - threadpool->n_threads_max = tpp->n_threads; - threadpool->n_threads_cur = tpp->n_threads; - threadpool->poll = tpp->poll; - threadpool->prio = tpp->prio; - threadpool->ec = GGML_STATUS_SUCCESS; - } - - // Allocate and init workers state - const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; - struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size); - - memset(workers, 0, workers_size); - for (int j = 0; j < tpp->n_threads; j++) { - workers[j].threadpool = threadpool; - workers[j].ith = j; - } - - threadpool->workers = workers; - -#ifndef GGML_USE_OPENMP - ggml_mutex_init(&threadpool->mutex); - ggml_cond_init(&threadpool->cond); - - // Spin the threads for all workers, and update CPU placements. - // Place the main thread last (towards the higher numbered CPU cores). - - int32_t cpumask_iter = 0; - - for (int j = 1; j < tpp->n_threads; j++) { - ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); - - int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]); - GGML_ASSERT(rc == 0); - } - - ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter); - - if (!threadpool->pause) { - // Update main thread prio and affinity at the start, otherwise we'll do it in resume - ggml_thread_apply_priority(threadpool->prio); - if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { - ggml_thread_apply_affinity(threadpool->workers[0].cpumask); - } - } -#endif // GGML_USE_OPENMP - - return threadpool; -} - -struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { - return ggml_threadpool_new_impl(tpp, NULL, NULL); -} - -enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { - GGML_ASSERT(cplan); - GGML_ASSERT(cplan->n_threads > 0); - GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - - int n_threads = cplan->n_threads; - struct ggml_threadpool * threadpool = cplan->threadpool; - - bool disposable_threadpool = false; - - if (threadpool == NULL) { - GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); - disposable_threadpool = true; - - struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); - threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); - } else { - // Reset some of the parameters that need resetting - // No worker threads should be accessing the parameters below at this stage - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->current_chunk = 0; - threadpool->abort = false; - threadpool->ec = GGML_STATUS_SUCCESS; - } - -#ifdef GGML_USE_OPENMP - if (n_threads > 1) { - #pragma omp parallel num_threads(n_threads) - { - #pragma omp single - { - // update the number of threads from the actual number of threads that we got from OpenMP - n_threads = omp_get_num_threads(); - atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); - } - - ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); - } - } else { - atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed); - ggml_graph_compute_thread(&threadpool->workers[0]); - } -#else - if (n_threads > threadpool->n_threads_max) { - GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); - n_threads = threadpool->n_threads_max; - } - - // Kick all threads to start the new graph - ggml_graph_compute_kickoff(threadpool, n_threads); - - // This is a work thread too - ggml_graph_compute_thread(&threadpool->workers[0]); -#endif - - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); - - enum ggml_status ret = threadpool->ec; - - if (disposable_threadpool) { - ggml_threadpool_free(threadpool); - } - - return ret; -} - -enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); - - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - - return ggml_graph_compute(cgraph, &cplan); -} - struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -20168,490 +6540,6 @@ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const ch return NULL; } -static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - ggml_n_dims(tensor), - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); -} - -static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", - arg, - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - ggml_n_dims(tensor), - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); -} - -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - uint64_t size_eval = 0; - - // compute size of intermediate results - for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += ggml_nbytes_pad(cgraph->nodes[i]); - } - - // print - { - FILE * fout = stdout; - - fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", - "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_leafs; ++i) { - ggml_graph_export_leaf(cgraph->leafs[i], fout); - - GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); - GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); - } - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", - "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_nodes; ++i) { - ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - - for (int j = 0; j < GGML_MAX_SRC; ++j) { - if (cgraph->nodes[i]->src[j]) { - ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); - } - } - - fprintf(fout, "\n"); - } - - fprintf(fout, "\n"); - } - - // write binary data - { - FILE * fout = ggml_fopen(fname, "wb"); - - if (!fout) { - fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno)); - return; - } - - // header - { - const uint32_t magic = GGML_FILE_MAGIC; - const uint32_t version = GGML_FILE_VERSION; - const uint32_t n_leafs = cgraph->n_leafs; - const uint32_t n_nodes = cgraph->n_nodes; - - fwrite(&magic, sizeof(uint32_t), 1, fout); - fwrite(&version, sizeof(uint32_t), 1, fout); - fwrite(&n_leafs, sizeof(uint32_t), 1, fout); - fwrite(&n_nodes, sizeof(uint32_t), 1, fout); - fwrite(&size_eval, sizeof(uint64_t), 1, fout); - } - - // leafs - { - for (int i = 0; i < cgraph->n_leafs; ++i) { - const struct ggml_tensor * tensor = cgraph->leafs[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const int32_t flags = tensor->flags; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&flags, sizeof(int32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); - - // dump the data - // TODO: pad this to 32 byte boundary - { - const size_t size = ggml_nbytes(tensor); - - fwrite(tensor->data, sizeof(char), size, fout); - } - } - } - - // nodes - { - for (int i = 0; i < cgraph->n_nodes; ++i) { - const struct ggml_tensor * tensor = cgraph->nodes[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const int32_t flags = tensor->flags; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&flags, sizeof(int32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); - - // output the op arguments - { - struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - - for (int j = 0; j < GGML_MAX_SRC; ++j) { - args[j] = tensor->src[j]; - } - - for (int j = 0; j < GGML_MAX_SRC; ++j) { - if (args[j]) { - int32_t idx = -1; - - // check if leaf - { - for (int k = 0; k < cgraph->n_leafs; ++k) { - if (args[j] == cgraph->leafs[k]) { - idx = k; - break; - } - } - } - - // check if node - if (idx == -1) { - for (int k = 0; k < cgraph->n_nodes; ++k) { - if (args[j] == cgraph->nodes[k]) { - idx = cgraph->n_leafs + k; - break; - } - } - } - - if (idx == -1) { - fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); - fclose(fout); - return; - } - - fwrite(&idx, sizeof(int32_t), 1, fout); - } else { - const int32_t nul = -1; - - fwrite(&nul, sizeof(int32_t), 1, fout); - } - } - } - - // dump the data - // TODO: pad this to 32 byte boundary - if ((flags & GGML_TENSOR_FLAG_PARAM)) { - const size_t size = ggml_nbytes(tensor); - - fwrite(tensor->data, sizeof(char), size, fout); - } - } - } - - fclose(fout); - } -} - -struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { - assert(*ctx_data == NULL); - assert(*ctx_eval == NULL); - - struct ggml_cgraph * result = NULL; - - struct ggml_tensor * data = NULL; - - // read file into data - { - FILE * fin = ggml_fopen(fname, "rb"); - if (!fin) { - fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno)); - return result; - } - - size_t fsize = 0; - - fseek(fin, 0, SEEK_END); - fsize = ftell(fin); - fseek(fin, 0, SEEK_SET); - - // create the data context - { - const size_t overhead = 1*ggml_tensor_overhead(); - - struct ggml_init_params params = { - .mem_size = fsize + overhead, - .mem_buffer = NULL, - .no_alloc = false, - }; - - *ctx_data = ggml_init(params); - - if (!*ctx_data) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - fclose(fin); - return result; - } - } - - data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); - - { - const size_t ret = fread(data->data, sizeof(char), fsize, fin); - if (ret != fsize) { - fprintf(stderr, "%s: failed to read %s\n", __func__, fname); - fclose(fin); - return result; - } - } - - fclose(fin); - } - - // populate result - { - char * ptr = (char *) data->data; - - const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); - - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); - return result; - } - - const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); - - if (version != GGML_FILE_VERSION) { - fprintf(stderr, "%s: invalid version number\n", __func__); - return result; - } - - const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); - const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); - const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); - const int graph_size = MAX(n_leafs, n_nodes); - - // create the data context - { - const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false); - - struct ggml_init_params params = { - .mem_size = size_eval + overhead, - .mem_buffer = NULL, - .no_alloc = true, - }; - - *ctx_eval = ggml_init(params); - - if (!*ctx_eval) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - return result; - } - } - - result = ggml_new_graph_custom(*ctx_eval, graph_size, false); - - result->n_leafs = n_leafs; - result->n_nodes = n_nodes; - - - // leafs - { - uint32_t type; - uint32_t op; - int32_t flags; - - for (uint32_t i = 0; i < n_leafs; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - flags = *(const int32_t *) ptr; ptr += sizeof(flags); - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); - - tensor->op = (enum ggml_op) op; - tensor->flags = flags; - - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor); - - result->leafs[i] = tensor; - - fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); - } - } - - ggml_set_no_alloc(*ctx_eval, false); - - // nodes - { - uint32_t type; - uint32_t op; - int32_t flags; - - for (uint32_t i = 0; i < n_nodes; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - flags = *(const int32_t *) ptr; ptr += sizeof(flags); - - enum ggml_op eop = (enum ggml_op) op; - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS; - - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - - struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - - // parse args - for (int j = 0; j < GGML_MAX_SRC; ++j) { - const int32_t arg_idx = ptr_arg_idx[j]; - - if (arg_idx == -1) { - continue; - } - - if (arg_idx < result->n_leafs) { - args[j] = result->leafs[arg_idx]; - } else { - args[j] = result->nodes[arg_idx - result->n_leafs]; - } - } - - // create the tensor - // "view" operations are handled differently - // TODO: handle inplace ops - currently a copy is always made - - struct ggml_tensor * tensor = NULL; - - switch (eop) { - // TODO: implement other view ops - case GGML_OP_RESHAPE: - { - tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); - } break; - case GGML_OP_VIEW: - { - tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); - - size_t offs; - memcpy(&offs, ptr_op_params, sizeof(offs)); - - tensor->data = ((char *) tensor->data) + offs; - } break; - case GGML_OP_TRANSPOSE: - { - tensor = ggml_transpose(*ctx_eval, args[0]); - } break; - case GGML_OP_PERMUTE: - { - tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); - } break; - default: - { - tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); - - tensor->op = eop; - } break; - } - - memcpy(tensor->name, ptr_name, GGML_MAX_NAME); - memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - for (int j = 0; j < GGML_MAX_SRC; ++j) { - tensor->src[j] = args[j]; - } - - result->nodes[i] = tensor; - - // TODO tensor data is be duplicated due to ggml_new_tensor call above - if (flags & GGML_TENSOR_FLAG_PARAM) { - tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor); - } - - fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); - } - } - } - - return result; -} - void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_LOG_INFO("=== GRAPH ===\n"); @@ -20799,15 +6687,17 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph if (ggml_nelements(node) < 5 && node->data != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { - if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { - fprintf(fp, "%d", ggml_get_i32_1d(node, j)); - } - else if (node->type == GGML_TYPE_F32 || - node->type == GGML_TYPE_F16 || - node->type == GGML_TYPE_BF16) { - fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); - } - else { + // FIXME: use ggml-backend to obtain the tensor data + //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + // fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + //} + //else if (node->type == GGML_TYPE_F32 || + // node->type == GGML_TYPE_F16 || + // node->type == GGML_TYPE_BF16) { + // fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + //} + //else + { fprintf(fp, "#"); } if (j < ggml_nelements(node) - 1) { @@ -20852,918 +6742,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph //////////////////////////////////////////////////////////////////////////////// -static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { - int i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to set tensor from array - for (int64_t j = 0; j < ne; ++j) { - ggml_set_f32_1d(ps[p], j, x[i++]); - } - } -} - -static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { - int i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to get all elements at once - for (int64_t j = 0; j < ne; ++j) { - x[i++] = ggml_get_f32_1d(ps[p], j); - } - } -} - -static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { - int64_t i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to get all elements at once - for (int64_t j = 0; j < ne; ++j) { - g[i++] = ggml_get_f32_1d(ps[p]->grad, j); - } - } -} - -static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) { - int64_t i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to get all elements at once - for (int64_t j = 0; j < ne; ++j) { - g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale; - } - } -} - -// -// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf -// -// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf) -// - -static enum ggml_opt_result ggml_opt_adam( - struct ggml_context * ctx, - struct ggml_opt_context * opt, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { - GGML_ASSERT(ggml_is_scalar(f)); - GGML_ASSERT(f->type == GGML_TYPE_F32); - - // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; - - int np = 0; - int64_t nx = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - - GGML_ASSERT(np < GGML_MAX_PARAMS); - - ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); - } - } - - if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { - int iter = opt->iter; - ggml_opt_init(opt->ctx, opt, params, nx); - opt->iter = iter; - } - - // constants - float sched = params.adam.sched; - const float alpha = params.adam.alpha; - const float decay = params.adam.decay * alpha; - const float beta1 = params.adam.beta1; - const float beta2 = params.adam.beta2; - const float eps = params.adam.eps; - const float gclip = params.adam.gclip; - const int decay_min_ndim = params.adam.decay_min_ndim; - const int n_accum = MAX(1, params.n_gradient_accumulation); - const float accum_norm = 1.0f / (float) n_accum; - - float * g = opt->adam.g->data; // gradients - float * m = opt->adam.m->data; // first moment - float * v = opt->adam.v->data; // second moment - - float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - - bool cancel = false; - - // compute the function value - float fx = 0; - ggml_set_zero(opt->adam.g); - for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - callback(callback_data, accum_step, &sched, &cancel); - if (cancel) { - return GGML_OPT_RESULT_CANCEL; - } - } - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); - ggml_opt_acc_grad(np, ps, g, accum_norm); - fx += ggml_get_f32_1d(f, 0); - } - fx *= accum_norm; - - opt->adam.fx_prev = fx; - opt->adam.fx_best = opt->adam.fx_prev; - if (pf) { - pf[opt->iter % params.past] = opt->adam.fx_prev; - } - - opt->loss_before = opt->adam.fx_prev; - opt->loss_after = opt->adam.fx_prev; - - // initialize - if (opt->just_initialized) { - opt->adam.n_no_improvement = 0; - opt->just_initialized = false; - } - - float * fx_best = &opt->adam.fx_best; - float * fx_prev = &opt->adam.fx_prev; - int * n_no_improvement = &opt->adam.n_no_improvement; - - int iter0 = opt->iter; - - // run the optimizer - for (int t = 0; t < params.adam.n_iter; ++t) { - opt->iter = iter0 + t + 1; - GGML_PRINT_DEBUG ("=== iter %d ===\n", t); - - GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); - GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); - - for (int i = 0; i < np; ++i) { - GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, - ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); - } - - const int64_t t_start_wall = ggml_time_us(); - const int64_t t_start_cpu = ggml_cycles(); - UNUSED(t_start_wall); - UNUSED(t_start_cpu); - - { - float gnorm = 1.0f; - if (gclip > 0.0f) { - // gradient clipping - ggml_float sum = 0.0; - for (int64_t i = 0; i < nx; ++i) { - sum += (ggml_float)(g[i]*g[i]); - } - ggml_float norm = sqrt(sum); - if (norm > (ggml_float) gclip) { - gnorm = (float) ((ggml_float) gclip / norm); - } - } - const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); - const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); - int64_t i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched; - for (int64_t j = 0; j < ne; ++j) { - float x = ggml_get_f32_1d(ps[p], j); - float g_ = g[i]*gnorm; - m[i] = m[i]*beta1 + g_*(1.0f - beta1); - v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); - float mh = m[i]*beta1h; - float vh = v[i]*beta2h; - vh = sqrtf(vh) + eps; - x = x*(1.0f - p_decay) - mh/vh; - ggml_set_f32_1d(ps[p], j, x); - ++i; - } - } - } - - fx = 0; - ggml_set_zero(opt->adam.g); - for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - callback(callback_data, accum_step, &sched, &cancel); - if (cancel) { - return GGML_OPT_RESULT_CANCEL;; - } - } - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); - ggml_opt_acc_grad(np, ps, g, accum_norm); - fx += ggml_get_f32_1d(f, 0); - } - fx *= accum_norm; - - opt->loss_after = fx; - - // check convergence - if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { - GGML_PRINT_DEBUG("converged\n"); - - return GGML_OPT_RESULT_OK; - } - - // delta-based convergence test - if (pf != NULL) { - // need at least params.past iterations to start checking for convergence - if (params.past <= iter0 + t) { - const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; - - if (fabsf(rate) < params.delta) { - return GGML_OPT_RESULT_OK; - } - } - - pf[(iter0 + t)%params.past] = fx; - } - - // check for improvement - if (params.max_no_improvement > 0) { - if (fx_best[0] > fx) { - fx_best[0] = fx; - n_no_improvement[0] = 0; - } else { - ++n_no_improvement[0]; - - if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_RESULT_OK; - } - } - } - - fx_prev[0] = fx; - - { - const int64_t t_end_cpu = ggml_cycles(); - GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); - UNUSED(t_end_cpu); - - const int64_t t_end_wall = ggml_time_us(); - GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); - UNUSED(t_end_wall); - } - } - - return GGML_OPT_RESULT_DID_NOT_CONVERGE; -} - -// -// L-BFGS -// -// the L-BFGS implementation below is based on the following implementation: -// -// https://github.com/chokkan/liblbfgs -// - -struct ggml_lbfgs_iteration_data { - float alpha; - float ys; - float * s; - float * y; -}; - -static enum ggml_opt_result linesearch_backtracking( - const struct ggml_opt_params * params, - int nx, - float * x, - float * fx, - float * g, - float * d, - float * step, - const float * xp, - struct ggml_tensor * f, - struct ggml_cgraph * gb, - struct ggml_cplan * cplan, - const int np, - struct ggml_tensor * ps[], - bool * cancel, - ggml_opt_callback callback, - void * callback_data) { - int count = 0; - - float width = 0.0f; - float dg = 0.0f; - float finit = 0.0f; - float dginit = 0.0f; - float dgtest = 0.0f; - - const float dec = 0.5f; - const float inc = 2.1f; - - const int n_accum = MAX(1, params->n_gradient_accumulation); - const float accum_norm = 1.0f / (float) n_accum; - - if (*step <= 0.f) { - return GGML_LINESEARCH_INVALID_PARAMETERS; - } - - // compute the initial gradient in the search direction - ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1); - - // make sure that d points to a descent direction - if (0 < dginit) { - return GGML_LINESEARCH_FAIL; - } - - // initialize local variables - finit = *fx; - dgtest = params->lbfgs.ftol*dginit; - - while (true) { - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_mad_f32(nx, x, d, *step); - - // evaluate the function and gradient values - { - ggml_opt_set_params(np, ps, x); - - *fx = 0; - memset(g, 0, sizeof(float)*nx); - for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, accum_step, &sched, cancel); - if (*cancel) { - return GGML_OPT_RESULT_CANCEL; - } - } - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, cplan); - ggml_opt_acc_grad(np, ps, g, accum_norm); - *fx += ggml_get_f32_1d(f, 0); - } - *fx *= accum_norm; - - } - - ++count; - - if (*fx > finit + (*step)*dgtest) { - width = dec; - } else { - // Armijo condition is satisfied - if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; - } - - ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1); - - // check the Wolfe condition - if (dg < params->lbfgs.wolfe * dginit) { - width = inc; - } else { - if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { - // regular Wolfe conditions - return count; - } - - if(dg > -params->lbfgs.wolfe*dginit) { - width = dec; - } else { - // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; - } - } - } - - if (*step < params->lbfgs.min_step) { - return GGML_LINESEARCH_MINIMUM_STEP; - } - if (*step > params->lbfgs.max_step) { - return GGML_LINESEARCH_MAXIMUM_STEP; - } - if (params->lbfgs.max_linesearch <= count) { - return GGML_LINESEARCH_MAXIMUM_ITERATIONS; - } - - (*step) *= width; - } - - GGML_ABORT("line search failed"); - - //return GGML_LINESEARCH_FAIL; -} - -static enum ggml_opt_result ggml_opt_lbfgs( - struct ggml_context * ctx, - struct ggml_opt_context * opt, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { - if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || - params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { - if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_RESULT_INVALID_WOLFE; - } - } - - const int m = params.lbfgs.m; - - // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; - - int np = 0; - int nx = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - - GGML_ASSERT(np < GGML_MAX_PARAMS); - - ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); - } - } - - if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { - int iter = opt->iter; - ggml_opt_init(ctx, opt, params, nx); - opt->iter = iter; - } - - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - - float * x = opt->lbfgs.x->data; // current parameters - float * xp = opt->lbfgs.xp->data; // previous parameters - float * g = opt->lbfgs.g->data; // current gradient - float * gp = opt->lbfgs.gp->data; // previous gradient - float * d = opt->lbfgs.d->data; // search direction - - float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values - - const int n_accum = MAX(1, params.n_gradient_accumulation); - const float accum_norm = 1.0f / (float) n_accum; - - float fx = 0.0f; // cost function value - float xnorm = 0.0f; // ||x|| - float gnorm = 0.0f; // ||g|| - - // initialize x from the graph nodes - ggml_opt_get_params(np, ps, x); - - // the L-BFGS memory - float * lm_alpha = opt->lbfgs.lmal->data; - float * lm_ys = opt->lbfgs.lmys->data; - float * lm_s = opt->lbfgs.lms->data; - float * lm_y = opt->lbfgs.lmy->data; - - bool cancel = false; - - // evaluate the function value and its gradient - { - ggml_opt_set_params(np, ps, x); - - fx = 0; - memset(g, 0, sizeof(float)*nx); - for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, accum_step, &sched, &cancel); - if (cancel) { - return GGML_OPT_RESULT_CANCEL; - } - } - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); - ggml_opt_acc_grad(np, ps, g, accum_norm); - fx += ggml_get_f32_1d(f, 0); - } - fx *= accum_norm; - - opt->loss_before = fx; - opt->loss_after = fx; - } - - // search direction = -gradient - ggml_vec_neg_f32(nx, d, g); - - // ||x||, ||g|| - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); - - if (xnorm < 1.0f) { - xnorm = 1.0f; - } - - // already optimized - if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_RESULT_OK; - } - - if (opt->just_initialized) { - if (pf) { - pf[0] = fx; - } - opt->lbfgs.fx_best = fx; - - // initial step - ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); - opt->lbfgs.j = 0; - opt->lbfgs.k = 1; - opt->lbfgs.end = 0; - opt->lbfgs.n_no_improvement = 0; - opt->just_initialized = false; - } - - float * fx_best = &opt->lbfgs.fx_best; - float * step = &opt->lbfgs.step; - int * j = &opt->lbfgs.j; - int * k = &opt->lbfgs.k; - int * end = &opt->lbfgs.end; - int * n_no_improvement = &opt->lbfgs.n_no_improvement; - - int ls = 0; - int bound = 0; - - float ys = 0.0f; - float yy = 0.0f; - float beta = 0.0f; - - int it = 0; - - while (true) { - // store the current position and gradient vectors - ggml_vec_cpy_f32(nx, xp, x); - ggml_vec_cpy_f32(nx, gp, g); - - // TODO: instead of passing &cancel here, use the return code of the linesearch - // to determine if the optimization should be cancelled - // this is a simple change, but not doing this atm, since I don't have a nice - // way to test and don't want to break something with so many changes lined up - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); - if (cancel) { - return GGML_OPT_RESULT_CANCEL; - } - - if (ls < 0) { - // linesearch failed - go back to the previous point and return - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_cpy_f32(nx, g, gp); - - return ls; - } - - opt->loss_after = fx; - - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); - - GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - - if (xnorm < 1.0f) { - xnorm = 1.0f; - } - if (gnorm/xnorm <= params.lbfgs.eps) { - // converged - return GGML_OPT_RESULT_OK; - } - - // delta-based convergence test - if (pf != NULL) { - // need at least params.past iterations to start checking for convergence - if (params.past <= k[0]) { - const float rate = (pf[k[0]%params.past] - fx)/fx; - - if (fabsf(rate) < params.delta) { - return GGML_OPT_RESULT_OK; - } - } - - pf[k[0]%params.past] = fx; - } - - // check for improvement - if (params.max_no_improvement > 0) { - if (fx < fx_best[0]) { - fx_best[0] = fx; - n_no_improvement[0] = 0; - } else { - n_no_improvement[0]++; - - if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_RESULT_OK; - } - } - } - - if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { - // reached the maximum number of iterations - return GGML_OPT_RESULT_DID_NOT_CONVERGE; - } - - // update vectors s and y: - // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. - // y_{k+1} = g_{k+1} - g_{k}. - // - ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); - ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); - - // compute scalars ys and yy: - // ys = y^t \cdot s -> 1 / \rho. - // yy = y^t \cdot y. - // - ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1); - ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1); - - lm_ys[end[0]] = ys; - - // find new search direction - // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS - - bound = (m <= k[0]) ? m : k[0]; - k[0]++; - it++; - end[0] = (end[0] + 1)%m; - - // initialize search direction with -g - ggml_vec_neg_f32(nx, d, g); - - j[0] = end[0]; - for (int i = 0; i < bound; ++i) { - j[0] = (j[0] + m - 1) % m; - // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1); - lm_alpha[j[0]] /= lm_ys[j[0]]; - // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); - } - - ggml_vec_scale_f32(nx, d, ys/yy); - - for (int i = 0; i < bound; ++i) { - // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1); - beta /= lm_ys[j[0]]; - // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); - j[0] = (j[0] + 1)%m; - } - - step[0] = 1.0; - } - - GGML_ABORT("lbfgs failed"); - - //return GGML_OPT_RESULT_DID_NOT_CONVERGE; -} - -struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { - struct ggml_opt_params result; - - switch (type) { - case GGML_OPT_TYPE_ADAM: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_TYPE_ADAM, - .graph_size = GGML_DEFAULT_GRAPH_SIZE, - .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ? - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 100, - - .print_forward_graph = true, - .print_backward_graph = true, - - .n_gradient_accumulation = 1, - - .adam = { - .n_iter = 10000, - .sched = 1.000f, - .decay = 0.0f, - .decay_min_ndim = 2, - .alpha = 0.001f, - .beta1 = 0.9f, - .beta2 = 0.999f, - .eps = 1e-8f, - .eps_f = 1e-5f, - .eps_g = 1e-3f, - .gclip = 0.0f, - }, - }; - } break; - case GGML_OPT_TYPE_LBFGS: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_TYPE_LBFGS, - .graph_size = GGML_DEFAULT_GRAPH_SIZE, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 0, - - .print_forward_graph = true, - .print_backward_graph = true, - - .n_gradient_accumulation = 1, - - .lbfgs = { - .m = 6, - .n_iter = 100, - .max_linesearch = 20, - - .eps = 1e-5f, - .ftol = 1e-4f, - .wolfe = 0.9f, - .min_step = 1e-20f, - .max_step = 1e+20f, - - .linesearch = GGML_LINESEARCH_DEFAULT, - }, - }; - } break; - } - - return result; -} - -GGML_API void ggml_opt_init( - struct ggml_context * ctx, - struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx) { - opt->ctx = ctx; - opt->params = params; - opt->iter = 0; - opt->nx = nx; - opt->just_initialized = true; - if (opt->ctx == NULL) { - struct ggml_init_params ctx_opt_params; - if (opt->params.type == GGML_OPT_TYPE_ADAM) { - ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; - if (opt->params.past > 0) { - ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; - } - } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) { - ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); - if (opt->params.past > 0) { - ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; - } - } - ctx_opt_params.mem_buffer = NULL; - ctx_opt_params.no_alloc = false; - - opt->ctx = ggml_init(ctx_opt_params); - } - switch (opt->params.type) { - case GGML_OPT_TYPE_ADAM: - { - opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->adam.pf = params.past > 0 - ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) - : NULL; - ggml_set_zero(opt->adam.m); - ggml_set_zero(opt->adam.v); - if (opt->adam.pf) { - ggml_set_zero(opt->adam.pf); - } - } break; - case GGML_OPT_TYPE_LBFGS: - { - opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->lbfgs.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->lbfgs.d = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); - opt->lbfgs.pf = params.past > 0 - ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) - : NULL; - opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lms = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); - opt->lbfgs.lmy = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); - ggml_set_zero(opt->lbfgs.x); - ggml_set_zero(opt->lbfgs.xp); - ggml_set_zero(opt->lbfgs.g); - ggml_set_zero(opt->lbfgs.gp); - ggml_set_zero(opt->lbfgs.d); - if (opt->lbfgs.pf) { - ggml_set_zero(opt->lbfgs.pf); - } - ggml_set_zero(opt->lbfgs.lmal); - ggml_set_zero(opt->lbfgs.lmys); - ggml_set_zero(opt->lbfgs.lms); - ggml_set_zero(opt->lbfgs.lmy); - } break; - } -} - -enum ggml_opt_result ggml_opt( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f) { - bool free_ctx = false; - if (ctx == NULL) { - struct ggml_init_params params_ctx = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - ctx = ggml_init(params_ctx); - if (ctx == NULL) { - return GGML_OPT_RESULT_NO_CONTEXT; - } - - free_ctx = true; - } - - enum ggml_opt_result result = GGML_OPT_RESULT_OK; - - struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - - ggml_opt_init(ctx, opt, params, 0); - result = ggml_opt_resume(ctx, opt, f); - - if (free_ctx) { - ggml_free(ctx); - } - - return result; -} - -enum ggml_opt_result ggml_opt_resume( - struct ggml_context * ctx, - struct ggml_opt_context * opt, - struct ggml_tensor * f) { - - // build forward + backward compute graphs - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true); - ggml_build_forward_expand(gf, f); - - struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf); - ggml_build_backward_expand(ctx, gf, gb, false); - - return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); -} - -enum ggml_opt_result ggml_opt_resume_g( - struct ggml_context * ctx, - struct ggml_opt_context * opt, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { - - GGML_ASSERT(f->grad && "ggml_set_param must be called for at least one ancestor"); - - // build forward + backward compute graphs - enum ggml_opt_result result = GGML_OPT_RESULT_OK; - - switch (opt->params.type) { - case GGML_OPT_TYPE_ADAM: - { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); - } break; - case GGML_OPT_TYPE_LBFGS: - { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); - } break; - } - - if (opt->params.print_forward_graph) { - ggml_graph_print (gf); - ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); - } - - if (opt->params.print_backward_graph) { - ggml_graph_print (gb); - ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); - } - - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - void ggml_set_input(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_INPUT; } @@ -23247,22 +8225,6 @@ int ggml_cpu_has_fma(void) { #endif } -int ggml_cpu_has_neon(void) { -#if defined(__ARM_ARCH) - return ggml_arm_arch_features.has_neon; -#else - return 0; -#endif -} - -int ggml_cpu_has_sve(void) { -#if defined(__ARM_ARCH) - return ggml_arm_arch_features.has_sve; -#else - return 0; -#endif -} - int ggml_cpu_has_arm_fma(void) { #if defined(__ARM_FEATURE_FMA) return 1; @@ -23403,22 +8365,6 @@ int ggml_cpu_has_vsx(void) { #endif } -int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_ARCH) - return ggml_arm_arch_features.has_i8mm; -#else - return 0; -#endif -} - -int ggml_cpu_get_sve_cnt(void) { -#if defined(__ARM_ARCH) - return ggml_arm_arch_features.sve_cnt; -#else - return 0; -#endif -} - void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; diff --git a/include/llama.h b/include/llama.h index 24005548d..ccb48f73c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -2,6 +2,7 @@ #define LLAMA_H #include "ggml.h" +#include "ggml-cpu.h" #include "ggml-backend.h" #include diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 131d7c177..3df6e1f42 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -11,6 +11,7 @@ #include #include +#include constexpr int kVecSize = 1 << 16; @@ -136,7 +137,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - const auto * funcs = ggml_get_type_traits(ggml_type); + const auto * funcs = ggml_get_type_traits_cpu(ggml_type); Stat simple, ggml; diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 88e66ea13..e9af8a363 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -9,6 +9,7 @@ #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -236,7 +237,8 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0); + const auto * funcs = ggml_get_type_traits(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0); + const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0); std::vector q40; std::vector q41; @@ -282,10 +284,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type); + const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type); vdot->from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); - else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); + if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/spm-headers/ggml-cpu.h b/spm-headers/ggml-cpu.h new file mode 120000 index 000000000..66e629607 --- /dev/null +++ b/spm-headers/ggml-cpu.h @@ -0,0 +1 @@ +../ggml/include/ggml-cpu.h \ No newline at end of file diff --git a/src/llama.cpp b/src/llama.cpp index 3f534596e..3e563d811 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21900,6 +21900,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { + ggml_cpu_init(); // some ARM features are detected at runtime + static std::string s; s = ""; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2e3ad79f0..46346cbd0 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -16,6 +16,7 @@ #include +#include #include #include diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp index cf54237db..d85bf912b 100644 --- a/tests/test-barrier.cpp +++ b/tests/test-barrier.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "ggml-cpu.h" #include "ggml-backend.h" #include diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 2200ad93d..c712dba7f 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -1,5 +1,6 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #include "ggml.h" +#include "ggml-cpu.h" #include #include diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index d50417ba0..000e60adf 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -1,6 +1,7 @@ // Unit tests for quantization specific functions - quantize, dequantize and dot product #include "ggml.h" +#include "ggml-cpu.h" #undef NDEBUG #include @@ -78,18 +79,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { // Total dot product error static float dot_product_error( - const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2 + const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2 ) { std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); - const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); + const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type); qfns->from_float(test_data1, tmp_q1.data(), test_size); vdot->from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); const float dot_ref = dot_product(test_data1, test_data2, test_size); @@ -132,6 +133,7 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); // deprecated - skip if (qfns->blck_size == 0) { @@ -166,7 +168,7 @@ int main(int argc, char * argv[]) { printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error); } - const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); + const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index bdbdd90a8..221424de8 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -1,6 +1,7 @@ // Benchmark quantization specific functions on synthetic data #include "ggml.h" +#include "ggml-cpu.h" #undef NDEBUG #include @@ -271,6 +272,7 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { continue; } @@ -328,7 +330,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); + const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type); vdot->from_float(test_data1, test_q1, size); return test_q1[0]; }; @@ -346,7 +348,7 @@ int main(int argc, char * argv[]) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); + qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); return result; }; size_t quantized_size = ggml_row_size(type, size); diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 246bb227d..4656b30f0 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "ggml-cpu.h" #include #include From e2292aaa17cf8530b0d0d899909588c3a095799d Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Fri, 1 Nov 2024 16:55:10 +0200 Subject: [PATCH 026/279] metal : fix minor string leaks (ggml/1004) --- ggml/src/ggml-metal.m | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index a2b4d49d5..f9bd6faa4 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -450,7 +450,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } + +#if !__has_feature(objc_arc) + [options release]; +#endif } +#if GGML_METAL_EMBED_LIBRARY + [src release]; +#endif // GGML_METAL_EMBED_LIBRARY } } From 284e5b0275cc1292096e72e808e41d17e8cdf019 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Sat, 2 Nov 2024 05:09:12 -0400 Subject: [PATCH 027/279] cmake : make it possible linking ggml as external lib (ggml/1003) --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 82b81cf12..34b81bd7f 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1396,7 +1396,7 @@ if (EMSCRIPTEN) endif() target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) -target_include_directories(ggml PUBLIC ../include) +target_include_directories(ggml PUBLIC $ $) target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump From ce027adfb3b131f0d2368294fc276bb0e342b3f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 4 Nov 2024 10:33:37 +0200 Subject: [PATCH 028/279] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 48863847c..020c60f34 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -bb78a40dc60e04c626bac2b65840b509988e990d +a099cb514d6687e436a5a423d1fb0448be0feb20 From 329ed914c959c510d076fb06b43eeb3f7b804d6f Mon Sep 17 00:00:00 2001 From: leo-pony Date: Mon, 4 Nov 2024 19:08:22 +0800 Subject: [PATCH 029/279] CANN: adjust backend registry refactor. (#10158) remove buffer->iface.get_name that used in cann as it was removed in backend registry refactor PR. --- ggml/src/ggml-cann.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index f8ac11e41..776340881 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_cann_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; return buffer; From f8e58135cff1c373df2934306f9c9da99673c2ed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 4 Nov 2024 13:43:32 +0200 Subject: [PATCH 030/279] metal : move dequantize templates to beginning of MSL source (#0) --- ggml/src/ggml-metal.metal | 874 +++++++++++++++++++------------------- 1 file changed, 436 insertions(+), 438 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 57eb34f13..3eb976633 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -12,6 +12,442 @@ using namespace metal; #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; + +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + } +} + +template +void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + md; + reg[i/2][2*(i%2)+1] = d * x1 + md; + } +} + +template +void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + m; + reg[i/2][2*(i%2)+1] = d * x1 + m; + } +} + +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; + + for (int i = 0; i < 16; i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const float d = xb->d; + const float min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + float dl, ml; + uint8_t sc = xb->scales[il]; + + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; + + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + const float ml = 4.f * dl; + + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); + } +} + +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; +} + +template +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uchar * q = xb->qs; + + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.h; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.f; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const float qh_val = il<2 ? 16.f : 256.f; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +} + +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + float sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; + + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const float coef = il>1 ? 1.f/16.f : 1.f; + const float ml = d_all * sc * 32.f; + const float dl = d_all * sc * coef; + for (int i = 0; i < 16; ++i) { + const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) + : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } +} + +template +void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. + device const uint16_t * q2 = xb->qs + 4*ib32; + const uint32_t aux32_g = q2[0] | (q2[1] << 16); + const uint32_t aux32_s = q2[2] | (q2[3] << 16); + thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; + const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + +template +void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * q3 = xb->qs + 8*ib32; + device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); + uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } + grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); + grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); + signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } +} + +template +void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 8*ib32; + device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); + constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); + reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); + } + grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); + reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); + } +} + +template +void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * signs = qs + QK_K/8; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); + constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); + for (int i = 0; i < 8; ++i) { + reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); + reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); + } +} + +template +void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + const float d = xb->d; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint16_t * qh = xb->qh; + const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); + const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); + const uint16_t h = qh[ib32] >> 6*il; + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml; + reg[1][i] = dl * (grid1[i] >> 4) + ml; + reg[2][i] = dl * (grid2[i] & 0xf) + ml; + reg[3][i] = dl * (grid2[i] >> 4) + ml; + } +} + +template +void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + device const uint16_t * sc = (device const uint16_t *)xb->scales; + + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = scale.f16; + + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * qh = xb->qh + 2*ib32 + il; + + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); + const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml1; + reg[1][i] = dl * (grid1[i] >> 4) + ml1; + reg[2][i] = dl * (grid2[i] & 0xf) + ml2; + reg[3][i] = dl * (grid2[i] >> 4) + ml2; + } +} + +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + +template +void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; + const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)xb->d * (ls - 32); + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + enum ggml_sort_order { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC, @@ -3339,10 +3775,6 @@ static inline int best_index_int8(int n, constant float * val, float x) { return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } -constexpr constant static float kvalues_iq4nl_f[16] = { - -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f -}; - kernel void kernel_cpy_f32_iq4_nl( device const float * src0, device void * dst, @@ -5457,440 +5889,6 @@ kernel void kernel_mul_mv_iq4_xs_f32( kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } -//============================= templates and their specializations ============================= - -// NOTE: this is not dequantizing - we are simply fitting the template -template -void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - float4x4 temp = *(((device float4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} - -template -void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - half4x4 temp = *(((device half4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} - -template -void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; - } -} - -template -void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; - } -} - -template -void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 3); - const float d = xb->d; - const float md = -16.h * xb->d; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[i/2][2*(i%2)+0] = d * x0 + md; - reg[i/2][2*(i%2)+1] = d * x1 + md; - } -} - -template -void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 4); - const float d = xb->d; - const float m = xb->m; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[i/2][2*(i%2)+0] = d * x0 + m; - reg[i/2][2*(i%2)+1] = d * x1 + m; - } -} - -template -void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { - device const int8_t * qs = ((device const int8_t *)xb->qs); - const half d = xb->d; - - for (int i = 0; i < 16; i++) { - reg[i/4][i%4] = (qs[i + 16*il] * d); - } -} - -template -void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { - const float d = xb->d; - const float min = xb->dmin; - device const uint8_t * q = (device const uint8_t *)xb->qs; - float dl, ml; - uint8_t sc = xb->scales[il]; - - q = q + 32*(il/8) + 16*(il&1); - il = (il/2)%4; - - half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * q = (device const uint8_t *)xb->qs; - device const uint8_t * h = (device const uint8_t *)xb->hmask; - device const int8_t * scales = (device const int8_t *)xb->scales; - - q = q + 32 * (il/8) + 16 * (il&1); - h = h + 16 * (il&1); - uint8_t m = 1 << (il/2); - uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ - ((il/4)>0 ? 12 : 3); - uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; - uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; - int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) - : (scale_2&kmask2) | ((scale_1&kmask1) << 4); - float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); - const float ml = 4.f * dl; - - il = (il/2) & 3; - const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl *= coef; - - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); - } -} - -static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { - return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} - : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; -} - -template -void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { - device const uchar * q = xb->qs; - - short is = (il/4) * 2; - q = q + (il/4) * 32 + 16 * (il&1); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.h; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il<2 ? 0x0F : 0xF0; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { - device const uint8_t * q = xb->qs; - device const uint8_t * qh = xb->qh; - - short is = (il/4) * 2; - q = q + 32 * (il/4) + 16 * (il&1); - qh = qh + 16 * (il&1); - uint8_t ul = 1 << (il/2); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.f; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il<2 ? 0x0F : 0xF0; - const float qh_val = il<2 ? 16.f : 256.f; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; - } -} - -template -void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * ql = (device const uint8_t *)xb->ql; - device const uint8_t * qh = (device const uint8_t *)xb->qh; - device const int8_t * scales = (device const int8_t *)xb->scales; - - ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); - qh = qh + 32*(il/8) + 16*(il&1); - float sc = scales[(il%2) + 2 * ((il/2))]; - il = (il/2) & 3; - - const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; - const float coef = il>1 ? 1.f/16.f : 1.f; - const float ml = d_all * sc * 32.f; - const float dl = d_all * sc * coef; - for (int i = 0; i < 16; ++i) { - const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) - : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); - reg[i/4][i%4] = dl * q - ml; - } -} - -template -void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. - device const uint16_t * q2 = xb->qs + 4*ib32; - const uint32_t aux32_g = q2[0] | (q2[1] << 16); - const uint32_t aux32_s = q2[2] | (q2[3] << 16); - thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; - const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint16_t * q2 = xb->qs + 4*ib32; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); - uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); - signs = ksigns_iq2xs[q2[2*il+1] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * q3 = xb->qs + 8*ib32; - device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; - constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); - constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); - uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } - grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); - grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); - signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 8*ib32; - device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); - constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); - constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); - reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); - } - grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); - grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); - reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); - } -} - -template -void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * signs = qs + QK_K/8; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); - constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); - for (int i = 0; i < 8; ++i) { - reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); - reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); - } -} - -template -void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - const float d = xb->d; - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint16_t * qh = xb->qh; - const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); - const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); - const uint16_t h = qh[ib32] >> 6*il; - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml; - reg[1][i] = dl * (grid1[i] >> 4) + ml; - reg[2][i] = dl * (grid2[i] & 0xf) + ml; - reg[3][i] = dl * (grid2[i] >> 4) + ml; - } -} - -template -void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - device const uint16_t * sc = (device const uint16_t *)xb->scales; - - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const float d = scale.f16; - - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * qh = xb->qh + 2*ib32 + il; - - const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); - const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml1; - reg[1][i] = dl * (grid1[i] >> 4) + ml1; - reg[2][i] = dl * (grid2[i] & 0xf) + ml2; - reg[3][i] = dl * (grid2[i] >> 4) + ml2; - } -} - -template -void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { - device const uint16_t * q4 = (device const uint16_t *)xb->qs; - const float d = xb->d; - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - -template -void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; - const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)xb->d * (ls - 32); - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - template kernel void kernel_get_rows_q( device const void * src0, From 05697f670b1ea28b80c39854832ea53527f75c55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 4 Nov 2024 13:49:34 +0200 Subject: [PATCH 031/279] metal : simplify f16 and f32 dequant kernels (#0) --- ggml/src/ggml-metal.metal | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 3eb976633..ff9d37490 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -19,18 +19,12 @@ constexpr constant static float kvalues_iq4nl_f[16] = { // NOTE: this is not dequantizing - we are simply fitting the template template void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - float4x4 temp = *(((device float4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } + reg = (type4x4)(*src); } template void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - half4x4 temp = *(((device half4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } + reg = (type4x4)(*src); } template From ea02c753ebf9342114cb173f10b3ffc2af1e7d04 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 4 Nov 2024 13:10:23 +0100 Subject: [PATCH 032/279] cuda : clear error after changing peer access (#10153) --- ggml/src/ggml-cuda.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index b57f1b3b7..e68e40550 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); if (err != cudaErrorPeerAccessAlreadyEnabled) { CUDA_CHECK(err); + } else { + // reset the error + cudaGetLastError(); } } else { cudaError_t err = cudaDeviceDisablePeerAccess(id_other); if (err != cudaErrorPeerAccessNotEnabled) { CUDA_CHECK(err); + } else { + // reset the error + cudaGetLastError(); } } } From 6a066b9978533e2ab9890b7f4f8c0262d91798b3 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:08:33 -0600 Subject: [PATCH 033/279] fix build break on arm64 linux (#10166) This fixes the build break from the recent changes to move the CPU backend to separate files https://github.com/ggerganov/llama.cpp/pull/10144 --- ggml/src/ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7aa6dce89..f792406e1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4,7 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" - +#include "ggml-cpu.h" #include #include From 9e0ecfb697d297355e43c20559d29bcc71beb0c3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 4 Nov 2024 16:33:29 +0100 Subject: [PATCH 034/279] server : clarify /slots endpoint, add is_processing (#10162) * server : clarify /slots endpoint, add is_processing * fix tests --- examples/server/README.md | 11 +++++------ examples/server/server.cpp | 16 ++++++++-------- examples/server/tests/features/steps/steps.py | 10 +++++----- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 1629e456b..15f95db1e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte ### GET `/slots`: Returns the current slots processing state -This endpoint can be disabled with `--no-slots` +> [!WARNING] +> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments. + +This endpoint is disabled by default and can be enabled with `--slots` If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots. @@ -709,6 +712,7 @@ Example: "grammar": "", "id": 0, "ignore_eos": false, + "is_processing": false, "logit_bias": [], "min_p": 0.05000000074505806, "mirostat": 0, @@ -741,7 +745,6 @@ Example: "temperature" ], "seed": 42, - "state": 1, "stop": [ "\n" ], @@ -755,10 +758,6 @@ Example: ] ``` -Possible values for `slot[i].state` are: -- `0`: SLOT_STATE_IDLE -- `1`: SLOT_STATE_PROCESSING - ### GET `/metrics`: Prometheus compatible metrics exporter This endpoint is only accessible if `--metrics` is set. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8531a784d..f0b89b22c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1566,11 +1566,11 @@ struct server_context { for (server_slot & slot : slots) { json slot_data = get_formated_generation(slot); - slot_data["id"] = slot.id; - slot_data["id_task"] = slot.id_task; - slot_data["state"] = slot.state; - slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); - slot_data["next_token"] = { + slot_data["id"] = slot.id; + slot_data["id_task"] = slot.id_task; + slot_data["is_processing"] = slot.is_processing(); + slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); + slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, @@ -1581,10 +1581,10 @@ struct server_context { {"stopping_word", slot.stopping_word}, }; - if (slot_data["state"] == SLOT_STATE_IDLE) { - n_idle_slots++; - } else { + if (slot.is_processing()) { n_processing_slots++; + } else { + n_idle_slots++; } slots_data.push_back(slot_data); diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 2e418d8aa..687b163f4 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str): match expected_slot_status_string: case 'idle': - expected_slot_status = 0 + expected_slot_status = False case 'busy': - expected_slot_status = 1 + expected_slot_status = True case _: assert False, "unknown status" - expected_slots = [{'id': slot_id, 'state': expected_slot_status} + expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status} for slot_id in range(context.n_slots)] await request_slots_status(context, expected_slots) @@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context, if status_code == 503 and status_code == expected_http_status_code: return if status_code == 200 and status_code == expected_http_status_code: - n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots) - n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots) + n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots) + n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots) if ((slots_idle is None or slots_idle == n_slots_idle) and (slots_processing is None or slots_processing == n_slots_processing)): return From 401558b7ba7a08175c153cd3607230f63c8a528e Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 4 Nov 2024 17:34:08 +0100 Subject: [PATCH 035/279] ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (#10167) --- ggml/src/ggml-cpu.c | 5 ++--- ggml/src/ggml.c | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c index 4b8ffb629..09ba49b13 100644 --- a/ggml/src/ggml-cpu.c +++ b/ggml/src/ggml-cpu.c @@ -304,6 +304,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_Q8_0] = { + .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) @@ -13692,9 +13693,7 @@ void ggml_cpu_init(void) { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - // FIXME: this table is used in conversion functions outside of compute - // current code depends on ggml_init initializing this table - float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + float f = GGML_FP16_TO_FP32(u.fp16); ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7dc3340a1..1ccf78d98 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -220,8 +220,10 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi void * ggml_aligned_malloc(size_t size) { + const int alignment = 64; + #if defined(_MSC_VER) || defined(__MINGW32__) - return _aligned_malloc(size, TENSOR_ALIGNMENT); + return _aligned_malloc(size, alignment); #else if (size == 0) { GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); @@ -229,8 +231,9 @@ void * ggml_aligned_malloc(size_t size) { } void * aligned_memory = NULL; #ifdef GGML_USE_CPU_HBM - int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); + int result = hbw_posix_memalign(&aligned_memory, alignment, size); #elif TARGET_OS_OSX + GGML_UNUSED(alignment); kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); int result = EFAULT; switch (alloc_status) { @@ -248,7 +251,7 @@ void * ggml_aligned_malloc(size_t size) { break; } #else - int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); + int result = posix_memalign(&aligned_memory, alignment, size); #endif if (result != 0) { // Handle allocation failure From d5a409e57fe8bd24fef597ab8a31110d390a6392 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 4 Nov 2024 20:06:58 +0100 Subject: [PATCH 036/279] ggml : fix gelu tables initialization (#10172) --- ggml/src/ggml-cpu.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c index 09ba49b13..0cb5b824a 100644 --- a/ggml/src/ggml-cpu.c +++ b/ggml/src/ggml-cpu.c @@ -13678,6 +13678,13 @@ int ggml_cpu_get_sve_cnt(void) { } void ggml_cpu_init(void) { + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + ggml_critical_section_start(); static bool is_first_call = true; @@ -13685,8 +13692,7 @@ void ggml_cpu_init(void) { if (is_first_call) { // initialize GELU, Quick GELU, SILU and EXP F32 tables { - // FIXME: this may be called before ggml_init - //const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + const uint64_t t_start = ggml_time_us(); UNUSED(t_start); for (int i = 0; i < (1 << 16); ++i) { union { @@ -13698,9 +13704,9 @@ void ggml_cpu_init(void) { ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } - //const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); + GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); } #if defined(__ARM_ARCH) From 340736477651095a98a3b10e19b038ec62593a1d Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Mon, 4 Nov 2024 22:06:31 +0000 Subject: [PATCH 037/279] Q6_K AVX improvements (#10118) * q6_k instruction reordering attempt * better subtract method * should be theoretically faster small improvement with shuffle lut, likely because all loads are already done at that stage * optimize bit fiddling * handle -32 offset separately. bsums exists for a reason! * use shift * Update ggml-quants.c * have to update ci macos version to 13 as 12 doesnt work now. 13 is still x86 --- .github/workflows/build.yml | 2 +- ggml/src/ggml-quants.c | 87 ++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 51 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 423173b97..02dcee963 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -92,7 +92,7 @@ jobs: name: llama-bin-macos-arm64.zip macOS-latest-cmake-x64: - runs-on: macos-12 + runs-on: macos-13 steps: - name: Clone diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index f792406e1..82a463f27 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined __AVX__ - const __m128i m4 = _mm_set1_epi8(0xF); const __m128i m3 = _mm_set1_epi8(3); - const __m128i m32s = _mm_set1_epi8(32); - const __m128i m2 = _mm_set1_epi8(2); + const __m128i m15 = _mm_set1_epi8(15); __m256 acc = _mm256_setzero_ps(); @@ -9119,12 +9117,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; + // handle the q6_k -32 offset separately using bsums + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); + const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); + const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); + const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); - __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + int is = 0; + for (int j = 0; j < QK_K/128; ++j) { const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; @@ -9132,26 +9138,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); - const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); - const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); - const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); - const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); + const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); + const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); + const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); + const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); - const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); - const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); - const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); - const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; @@ -9162,15 +9168,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); - __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); - __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); - __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); - __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4); - __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); - __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); - __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); - __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); @@ -9180,32 +9177,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - p16_4 = _mm_sub_epi16(p16_4, q8s_4); - p16_5 = _mm_sub_epi16(p16_5, q8s_5); - p16_6 = _mm_sub_epi16(p16_6, q8s_6); - p16_7 = _mm_sub_epi16(p16_7, q8s_7); - - const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); - p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); - p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); @@ -9214,8 +9199,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r } - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); + sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); + const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc); From a9e8a9a0306a8093eef93b0022d9f45510490072 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 4 Nov 2024 23:17:01 +0100 Subject: [PATCH 038/279] ggml : fix arch check in bf16_to_fp32 (#10164) --- ggml/src/ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1ccf78d98..e6a7824ba 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -395,6 +395,8 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { 16))); } } +#endif +#if defined(__AVX2__) if (ggml_cpu_has_avx2()) { for (; i + 8 <= n; i += 8) { _mm256_storeu_ps(y + i, From b8deef0ec0af5febac1d2cfd9119ff330ed0b762 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 5 Nov 2024 05:23:04 -0700 Subject: [PATCH 039/279] llama : add <|tool_call|> formatting to Granite template (#10177) Branch: GraniteToolCallTemplate Signed-off-by: Gabe Goodhart --- src/llama.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3e563d811..0cdf0c073 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21799,8 +21799,11 @@ static int32_t llama_chat_apply_template_internal( // IBM Granite template for (const auto & message : chat) { std::string role(message->role); - ss << "<|start_of_role|>" << role << "<|end_of_role|>" - << message->content << "<|end_of_text|>\n"; + ss << "<|start_of_role|>" << role << "<|end_of_role|>"; + if (role == "assistant_tool_call") { + ss << "<|tool_call|>"; + } + ss << message->content << "<|end_of_text|>\n"; } if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>\n"; From a1eaf6a9600bb1608753420ba886a3b0a208ffc0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Nov 2024 10:24:23 +0200 Subject: [PATCH 040/279] metal : add quantized FA support (#10149) * metal : add quantized FA (vec) support ggml-ci * metal : add quantized FA (non-vec) support * metal : fix support check ggml-ci * metal : clean-up * metal : clean-up (cont) * metal : fix shared memory calc + reduce smem + comments * metal : float-correctness * metal : minor [no ci] --- ggml/src/ggml-metal.m | 302 +++++++++++++++++++++---- ggml/src/ggml-metal.metal | 456 +++++++++++++++++++++++++------------- 2 files changed, 567 insertions(+), 191 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f9bd6faa4..aee354cdd 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -255,9 +255,49 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, - //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, - //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F16_F16, @@ -710,9 +750,49 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, support_simdgroup_mm); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, flash_attn_ext_q4_0_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, flash_attn_ext_q4_0_h128, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, flash_attn_ext_q4_1_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, flash_attn_ext_q4_1_h128, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, flash_attn_ext_q5_0_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, flash_attn_ext_q5_0_h128, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, flash_attn_ext_q5_1_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, flash_attn_ext_q5_1_h128, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, flash_attn_ext_q8_0_h112, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); @@ -869,13 +949,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_LEAKY_RELU: return true; case GGML_OP_FLASH_ATTN_EXT: - if (op->src[1]->type != GGML_TYPE_F16) { - return false; - } - if (op->src[2]->type != GGML_TYPE_F16) { - return false; - } - if (op->src[0]->ne[0] == 256) { + if (op->src[1]->type != op->src[2]->type) { return false; } return support_simdgroup_mm; // TODO: over-restricted for vec-kernels @@ -2822,6 +2896,7 @@ static void ggml_metal_encode_node( GGML_ASSERT(ne11 % 32 == 0); GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == src2->type); GGML_ASSERT(ggml_are_same_shape (src1, src2)); @@ -2869,26 +2944,154 @@ static void ggml_metal_encode_node( bool use_vec_kernel = false; if (ne01 >= 4 || (ne00%128 != 0)) { - switch (ne00) { - case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; - case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; - case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; - case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + switch (src1->type) { + case GGML_TYPE_F16: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q4_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q4_1: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q5_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q5_1: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q8_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; default: - { - GGML_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } } } else { use_vec_kernel = true; switch (ne00) { - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + case 128: + { + switch (src1->type) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } + } + } break; + case 256: + { + switch (src1->type) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } + } + } break; default: { GGML_LOG_ERROR("unsupported size: %lld\n", ne00); @@ -2942,10 +3145,16 @@ static void ggml_metal_encode_node( GGML_ASSERT(nqptg % 8 == 0); GGML_ASSERT(ncpsg % 32 == 0); + // 16*32*(nsg) + // the shared memory needed for the simdgroups to load the KV cache + // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG + // +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16)) + int64_t nsgmax = 2; while (true) { - const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsgmax); if (smem > device.maxThreadgroupMemoryLength) { break; } @@ -2956,16 +3165,15 @@ static void ggml_metal_encode_node( // simdgroups per threadgroup (a.k.a. warps) const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; - const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsg); - //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg); GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); - - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - + [encoder setThreadgroupMemoryLength:smem atIndex:0]; +#undef FATTN_SMEM [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; } else { - // half1x4 kernel + // half4x4 kernel const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! @@ -2973,8 +3181,28 @@ static void ggml_metal_encode_node( GGML_ASSERT(nqptg % 1 == 0); GGML_ASSERT(ncpsg % 32 == 0); + // ne00 + 2*ncpsg*(nsg) + // for each query, we load it as f16 in shared memory (ne00) + // and store the attention scores (nqptg x ncpsg) as f32 + // + // 2*ne00*(nsg) + // each simdgroup has a full f32 head vector in shared mem to accumulate results + // +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16)) + + int64_t nsgmax = 2; + + while (true) { + const size_t smem = FATTN_SMEM(nsgmax); + if (smem > device.maxThreadgroupMemoryLength) { + break; + } + nsgmax *= 2; + } + nsgmax /= 2; + // simdgroups per threadgroup (a.k.a. warps) - const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); + const int64_t nsgt = MAX(2, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))); int64_t nsg = 1; while (nsg <= nsgt) { @@ -2982,12 +3210,12 @@ static void ggml_metal_encode_node( } nsg /= 2; - const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsg); - //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg); GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - + [encoder setThreadgroupMemoryLength:smem atIndex:0]; +#undef FATTN_SMEM [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; } } break; diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index ff9d37490..b9ea9f08e 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -2723,46 +2723,10 @@ kernel void kernel_leaky_relu_f32( dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope; } -typedef void (flash_attn_ext_f16_t)( - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device float * dst, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant uint64_t & nb21, - constant uint64_t & nb22, - constant uint64_t & nb23, - constant uint64_t & nb31, - constant int64_t & ne1, - constant int64_t & ne2, - constant float & scale, - constant float & max_bias, - constant float & m0, - constant float & m1, - constant uint32_t & n_head_log2, - constant float & logit_softcap, - threadgroup half * shared, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]); - // ref: https://arxiv.org/pdf/2307.08691.pdf -template // head size, queries per threadgroup, cache items per threadgroup -kernel void kernel_flash_attn_ext_f16( +// D - head size, Q - queries per threadgroup, KV - key/value processed per each simdgroup, C - cache items per threadgroup +template +kernel void kernel_flash_attn_ext( device const char * q, device const char * k, device const char * v, @@ -2800,15 +2764,15 @@ kernel void kernel_flash_attn_ext_f16( ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups - const short iq3 = tgpig[2]; - const short iq2 = tgpig[1]; - const short iq1 = tgpig[0]*Q; + const int iq3 = tgpig[2]; + const int iq2 = tgpig[1]; + const int iq1 = tgpig[0]*Q; - const short D4 = D/4; - const short D8 = D/8; - //const short Q8 = Q/8; - const short NW = N_SIMDWIDTH; - const short SH = (C + Q); // shared memory per simdgroup in (half) + const short D4 = D/4; + const short D8 = D/8; + const short D16 = D/16; + const short NW = N_SIMDWIDTH; + const short SH = (C + Q); // shared memory per simdgroup in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half) const short TF = T/2; // shared memory size per query in (float) @@ -2818,6 +2782,9 @@ kernel void kernel_flash_attn_ext_f16( threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix + threadgroup half * skv = (threadgroup half *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory + threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4 + // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) simdgroup_half8x8 lo[D8]; @@ -2849,25 +2816,28 @@ kernel void kernel_flash_attn_ext_f16( threadgroup_barrier(mem_flags::mem_threadgroup); { - float S[Q] = { [0 ... Q-1] = 0.0h }; + float S[Q] = { [0 ... Q-1] = 0.0f }; float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 }; + // thread indices inside the simdgroup + const short tx = tiisg%4; + const short ty = tiisg/4; + // assume K and V are same shape const short ne22 = ne12; const short ne23 = ne13; - // broadcast + // broadcast k const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; - const short rv2 = ne02/ne22; - const short rv3 = ne03/ne23; - - // k indices const short ik2 = iq2/rk2; const short ik3 = iq3/rk3; - // v indices + // broadcast v + const short rv2 = ne02/ne22; + const short rv3 = ne03/ne23; + const short iv2 = iq2/rv2; const short iv3 = iq3/rv3; @@ -2906,13 +2876,59 @@ kernel void kernel_flash_attn_ext_f16( for (short cc = 0; cc < C/8; ++cc) { simdgroup_float8x8 mqk = make_filled_simdgroup_matrix(0.h); - device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); + // this is compile-time check, so it does not have runtime overhead + if (is_same::value) { + // we can read directly from global memory + device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); - for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mk; - simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mk; + simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose - simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); + simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); + } + } else { + for (short ii = 0; ii < D16; ii += 4) { + device const block_q * pk4 = (device const block_q *) ((device const char *) k + ((ic + 8*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); + + if (D16%4 == 0) { + // the head is evenly divisible by 4*16 = 64, so no need for bound checks + half4x4 tmp; + dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + + simdgroup_barrier(mem_flags::mem_threadgroup); + +#pragma unroll + for (short k = 0; k < 4; ++k) { + simdgroup_half8x8 mk; + + simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); + + simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); + } + } else { + if (ii + tx < D16) { + half4x4 tmp; + dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < D16; ++k) { + simdgroup_half8x8 mk; + + simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); + + simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); + } + } + } } simdgroup_store(mqk, ss + 8*cc, TF, 0, false); @@ -2977,16 +2993,61 @@ kernel void kernel_flash_attn_ext_f16( // O = O + (Q*K^T)*V { for (short cc = 0; cc < C/8; ++cc) { - device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); + simdgroup_float8x8 ms; + simdgroup_load(ms, ss + 8*cc, TF, 0, false); - for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mk; - simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false); + if (is_same::value) { + // we can read directly from global memory + device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); +#pragma unroll + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mv; + simdgroup_load(mv, pv + i*8, nb21/sizeof(half), 0, false); - simdgroup_float8x8 mv; - simdgroup_load(mv, ss + 8*cc, TF, 0, false); + simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]); + } + } else { + for (short ii = 0; ii < D16; ii += 4) { + device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 8*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); - simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]); + if (D16%4 == 0) { + // no need for bound checks + half4x4 tmp; + dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + + simdgroup_barrier(mem_flags::mem_threadgroup); + +#pragma unroll + for (short k = 0; k < 4; ++k) { + simdgroup_half8x8 mv; + + simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + + simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + } + } else { + if (ii + tx < D16) { + half4x4 tmp; + dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < D16; ++k) { + simdgroup_half8x8 mv; + + simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + + simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + } + } + } } } } @@ -3003,7 +3064,7 @@ kernel void kernel_flash_attn_ext_f16( // reduce the warps sequentially for (short sg = 1; sg < nsg; ++sg) { - float S = { 0.0h }; + float S = { 0.0f }; float M = { -FLT_MAX/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3082,15 +3143,54 @@ kernel void kernel_flash_attn_ext_f16( } } -template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>; -template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>; -template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>; -template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>; -template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>; -//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>; +typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; -template // head size, queries per threadgroup, cache items per threadgroup -kernel void kernel_flash_attn_ext_vec_f16( +template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +// NOTE: can use half instead of float precision for some extra perf +// D - head size, Q - queries per threadgroup, C - cache items per threadgroup +template +kernel void kernel_flash_attn_ext_vec( device const char * q, device const char * k, device const char * v, @@ -3128,36 +3228,27 @@ kernel void kernel_flash_attn_ext_vec_f16( ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups - const short iq3 = tgpig[2]; - const short iq2 = tgpig[1]; - const short iq1 = tgpig[0]; + const int iq3 = tgpig[2]; + const int iq2 = tgpig[1]; + const int iq1 = tgpig[0]; - const short D4 = D/4; - const short NW = N_SIMDWIDTH; - const short SH = (C + Q); // shared memory per simdgroup in (half) + const short D4 = D/4; + const short D16 = D/16; + const short NW = N_SIMDWIDTH; + const short NW4 = NW/4; + const short SH = C; // shared memory per simdgroup in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half) - float slope = 1.0f; - - // ALiBi - if (max_bias > 0.0f) { - const uint32_t h = iq2; - - const float base = h < n_head_log2 ? m0 : m1; - const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - - slope = pow(base, exp); - } - - //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data - threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 - threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix - threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 - threadgroup half4 * sr4 = (threadgroup half4 *) (shared + sgitg*D + 1*T); // scratch buffer for the results + //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data + threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 + threadgroup half4x4 * sq44 = (threadgroup half4x4 *) (shared + 0*D); // same as above but in half4x4 + threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention + threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 + threadgroup float4x4 * sr44 = (threadgroup float4x4 *) (shared + 2*sgitg*D + Q*T); // scratch buffer for the results // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) - half4 lo[D4/NW]; + float4x4 lo[D16/NW4]; // load heads from Q to shared memory device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03)); @@ -3171,8 +3262,8 @@ kernel void kernel_flash_attn_ext_vec_f16( } // zero out lo - for (short i = tiisg; i < D4; i += NW) { - lo[i/NW] = 0.0h; + for (short i = 0; i < D16/NW4; i += NW4) { + lo[i] = float4x4(0.0f); } // zero out shared memory SH @@ -3183,38 +3274,52 @@ kernel void kernel_flash_attn_ext_vec_f16( threadgroup_barrier(mem_flags::mem_threadgroup); { - float S = { 0.0h }; - float M = { -FLT_MAX/2 }; + float S = 0.0f; + float M = -FLT_MAX/2; + + // thread indices inside the simdgroup + const short tx = tiisg%8; + const short ty = tiisg/8; // assume K and V are same shape const short ne22 = ne12; const short ne23 = ne13; - // broadcast + // broadcast k const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; + const short ik2 = iq2/rk2; + const short ik3 = iq3/rk3; + + // broadcast v const short rv2 = ne02/ne22; const short rv3 = ne03/ne23; - // k indices - const short ik2 = iq2 / rk2; - const short ik3 = iq3 / rk3; - - // v indices - const short iv2 = iq2 / rv2; - const short iv3 = iq3 / rv3; + const short iv2 = iq2/rv2; + const short iv3 = iq3/rv3; // load the queries from shared memory into local memory - float4 mq[D4/NW]; + float4x4 mq[D16/NW4]; - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - mq[ii/NW] = (float4) sq4[i]; + for (short ii = 0; ii < D16; ii += NW4) { + mq[ii/NW4] = (float4x4) sq44[ii + tx]; } // pointer to the mask - device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31); + device const half * mp = (device const half *) (mask + iq1*nb31); + + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = iq2; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } // loop over the KV cache // each simdgroup handles blocks of Q rows and C columns @@ -3226,47 +3331,54 @@ kernel void kernel_flash_attn_ext_vec_f16( // Q*K^T { -#pragma unroll + // each simdgroup processes 1 query and 4 keys for (short cc = 0; cc < C/4; ++cc) { - float4 mqk = { 0.0h }; + float mqk = 0.0; - device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13)); + device const block_q * pk = (device const block_q *) ((device const char *) k + ((ic + 4*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - const short i = ii + tiisg; + for (short ii = 0; ii < D16; ii += NW4) { + const short i = ii + tx; float4x4 mk; - mk[0] = (float4) pk4[i + 0*(nb11/8)]; - mk[1] = (float4) pk4[i + 1*(nb11/8)]; - mk[2] = (float4) pk4[i + 2*(nb11/8)]; - mk[3] = (float4) pk4[i + 3*(nb11/8)]; + dequantize_func(pk + i/nl, i%nl, mk); - mqk += (float4) (mq[ii/NW] * mk); + mqk += + dot(mq[ii/NW4][0], mk[0]) + + dot(mq[ii/NW4][1], mk[1]) + + dot(mq[ii/NW4][2], mk[2]) + + dot(mq[ii/NW4][3], mk[3]); } - // reduce the results from the threads in the simdgroup - mqk += simd_shuffle_down(mqk, 16); - mqk += simd_shuffle_down(mqk, 8); + // simdgroup reduce + // [ 0 .. 7] -> [ 0] + // [ 8 .. 15] -> [ 8] + // [16 .. 23] -> [16] + // [24 .. 31] -> [24] + //mqk += simd_shuffle_down(mqk, 16); + //mqk += simd_shuffle_down(mqk, 8); mqk += simd_shuffle_down(mqk, 4); mqk += simd_shuffle_down(mqk, 2); mqk += simd_shuffle_down(mqk, 1); // mqk = mqk*scale + mask*slope - if (tiisg == 0) { + if (tx == 0) { mqk *= scale; if (logit_softcap != 0.0f) { mqk = logit_softcap*precise::tanh(mqk); } - mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f; + mqk += (mask != q) ? ((float) mp[ic + 4*cc + ty])*slope : (float) 0.0f; - ss4[cc] = mqk; + ss[4*cc + ty] = mqk; } } } + simdgroup_barrier(mem_flags::mem_threadgroup); + // online softmax { const short p = tiisg; @@ -3286,29 +3398,32 @@ kernel void kernel_flash_attn_ext_vec_f16( // O = diag(ms)*O #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - lo[ii/NW] *= ms; + for (short ii = 0; ii < D16; ii += NW4) { + lo[ii/NW4] *= ms; } } + simdgroup_barrier(mem_flags::mem_threadgroup); + // O = O + (Q*K^T)*V { #pragma unroll for (short cc = 0; cc < C/4; ++cc) { - device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23)); + device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 4*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); + + const float4x4 lss(ss[4*cc + ty]); #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - const short i = ii + tiisg; + for (short ii = 0; ii < D16; ii += NW4) { + const short i = ii + tx; - lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; - lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; - lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; - lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; + float4x4 mv; + dequantize_func(pv4 + i/nl, i%nl, mv); + + lo[ii/NW4] += mv*lss; } } } - } // these are needed for reducing the results from the simdgroups (reuse the ss buffer) @@ -3318,10 +3433,32 @@ kernel void kernel_flash_attn_ext_vec_f16( } } + // simdgroup reduce + // [ 0, 8, 16, 24] -> [ 0] + // [ 1, 9, 17, 25] -> [ 1] + // [ 2, 10, 18, 26] -> [ 2] + // [ 3, 11, 19, 27] -> [ 3] + // [ 4, 12, 20, 28] -> [ 4] + // [ 5, 13, 21, 29] -> [ 5] + // [ 6, 14, 22, 30] -> [ 6] + // [ 7, 15, 23, 31] -> [ 7] + for (short ii = 0; ii < D16; ii += NW4) { + lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 16); + lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 8); + + lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 16); + lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 8); + + lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 16); + lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 8); + + lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 16); + lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 8); + } + // store results to shared memory - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - sr4[i] = lo[ii/NW]; + for (short i = tiisg; i < D16; i += NW4) { + sr44[i] = lo[i/NW4]; } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3348,30 +3485,41 @@ kernel void kernel_flash_attn_ext_vec_f16( } // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1; + for (short i = tiisg; i < D16; i += NW) { + sr44[i] = sr44[i]*ms0 + sr44[i + r*D16]*ms1; } } threadgroup_barrier(mem_flags::mem_threadgroup); } - device float4 * dst4 = (device float4 *) dst; + device float4x4 * dst44 = (device float4x4 *) dst; // final rescale with 1/S and store to global memory if (sgitg == 0) { const float S = ss[0]; - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S; + for (short i = tiisg; i < D16; i += NW) { + dst44[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = sr44[i]/S; } } } -template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>; -//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>; +typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; + +template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; template kernel void kernel_cpy( From 1dc04b2deed2d2f2ae3aff9b14ae29674dee1fb8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Nov 2024 11:20:10 +0200 Subject: [PATCH 041/279] ggml : adjust is_first_call init value (#10193) ggml-ci --- ggml/src/ggml.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e6a7824ba..266a0d6f0 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1407,11 +1407,11 @@ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const str //////////////////////////////////////////////////////////////////////////////// struct ggml_context * ggml_init(struct ggml_init_params params) { - static bool is_first_call = false; + static bool is_first_call = true; ggml_critical_section_start(); - if (!is_first_call) { + if (is_first_call) { // initialize time system (required on Windows) ggml_time_init(); @@ -1422,7 +1422,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } u = {i}; ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); } - is_first_call = true; + + is_first_call = false; } ggml_critical_section_end(); From 94d8cb8be13b7c4d04eeca5a2b956b9148e6f222 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 6 Nov 2024 12:10:07 +0100 Subject: [PATCH 042/279] metal : fix from ptr buffer name (#10189) --- ggml/src/ggml-metal.m | 5 +++-- src/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index aee354cdd..9966a9e2f 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -4072,7 +4072,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } } - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { @@ -4082,7 +4082,8 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; UNUSED(dev); } diff --git a/src/llama.cpp b/src/llama.cpp index 0cdf0c073..6719edb38 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9134,7 +9134,7 @@ static bool llm_load_tensors( // print memory requirements per buffer type for (auto & buf : model.bufs) { - LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); } // populate tensors_by_name From b11f9ba9b8ce319f04b88afe40d264e6b7f4ba46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Nov 2024 13:29:01 +0200 Subject: [PATCH 043/279] server : remove hack for extra parallel slot (#10187) ggml-ci --- examples/server/server.cpp | 53 +++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f0b89b22c..1c7f0fd1d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -378,8 +378,8 @@ struct server_queue { std::condition_variable condition_tasks; // callback functions - std::function callback_new_task; - std::function callback_update_slots; + std::function callback_new_task; + std::function callback_update_slots; // Add a new task to the end of the queue int post(server_task task, bool front = false) { @@ -431,7 +431,7 @@ struct server_queue { } // Register function to process a new task - void on_new_task(std::function callback) { + void on_new_task(std::function callback) { callback_new_task = std::move(callback); } @@ -481,7 +481,7 @@ struct server_queue { lock.unlock(); QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(task); + callback_new_task(std::move(task)); } // all tasks in the current loop is processed, slots data is now ready @@ -644,17 +644,12 @@ struct server_context { bool load_model(const common_params & params_) { params = params_; - // reserve one extra sequence (seq_id == 0) for extra features - params.n_parallel += 1; - common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; loras = llama_init.lora_adapters; - params.n_parallel -= 1; // but be sneaky about it - if (model == nullptr) { SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; @@ -1288,16 +1283,16 @@ struct server_context { void send_embedding(const server_slot & slot, const llama_batch & batch) { server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + res.id = slot.id_task; + res.error = false; + res.stop = true; const int n_embd = llama_n_embd(model); std::vector embd_res(n_embd, 0.0f); for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { continue; } @@ -1332,12 +1327,12 @@ struct server_context { void send_rerank(const server_slot & slot, const llama_batch & batch) { server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + res.id = slot.id_task; + res.error = false; + res.stop = true; for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { continue; } @@ -1510,7 +1505,7 @@ struct server_context { // Functions to process the task // - void process_single_task(const server_task & task) { + void process_single_task(server_task task) { switch (task.type) { case SERVER_TASK_TYPE_INFERENCE: { @@ -1646,7 +1641,7 @@ struct server_context { std::string filename = task.data.at("filename"); std::string filepath = task.data.at("filepath"); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; @@ -1688,7 +1683,7 @@ struct server_context { slot->cache_tokens.resize(slot->n_ctx); size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); if (nread == 0) { slot->cache_tokens.resize(0); send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); @@ -1731,7 +1726,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); server_task_result result; @@ -1808,8 +1803,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1836,7 +1831,7 @@ struct server_context { slot.i_batch = batch.n_tokens; - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -1983,8 +1978,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2033,9 +2028,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -2048,7 +2043,7 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); From 5c333e014059122245c318e7ed4ec27d1085573c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Nov 2024 19:53:51 +0200 Subject: [PATCH 044/279] metal : add BF16 support (#8439) * ggml : add initial BF16 support ggml-ci * metal : add mul_mat_id BF16 support ggml-ci * metal : check for bfloat support on the Metal device ggml-ci * metal : better var names [no ci] * metal : do not build bfloat kernels when not supported ggml-ci * metal : try to fix BF16 support check ggml-ci * metal : this should correctly check bfloat support --- common/common.cpp | 3 + ggml/src/ggml-metal.m | 438 ++++++++++++++++++++++--------------- ggml/src/ggml-metal.metal | 58 ++++- tests/test-backend-ops.cpp | 2 +- 4 files changed, 317 insertions(+), 184 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c8cbaae11..19674af15 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { if (s == "f16") { return GGML_TYPE_F16; } + if (s == "bf16") { + return GGML_TYPE_BF16; + } if (s == "q8_0") { return GGML_TYPE_Q8_0; } diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 9966a9e2f..f13adee38 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -36,16 +36,18 @@ static struct ggml_backend_metal_device_context { id mtl_device; int mtl_device_ref_count; - bool support_simdgroup_reduction; - bool support_simdgroup_mm; + bool has_simdgroup_reduction; + bool has_simdgroup_mm; + bool has_bfloat; char name[128]; } g_ggml_ctx_dev_main = { - /*.mtl_device =*/ nil, - /*.mtl_device_ref_count =*/ 0, - /*.support_simdgroup_reduction =*/ false, - /*.support_simdgroup_mm =*/ false, - /*.name =*/ "", + /*.mtl_device =*/ nil, + /*.mtl_device_ref_count =*/ 0, + /*.has_simdgroup_reduction =*/ false, + /*.has_simdgroup_mm =*/ false, + /*.has_bfloat =*/ false, + /*.name =*/ "", }; // acquire @@ -55,10 +57,13 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); - ctx->support_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; - ctx->support_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; - ctx->support_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + + ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -120,6 +125,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, + GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, @@ -146,10 +152,14 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, @@ -170,10 +180,11 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, - //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, + //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, @@ -195,6 +206,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, @@ -216,6 +228,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, @@ -300,8 +313,11 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, + GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, GGML_METAL_KERNEL_TYPE_CPY_F16_F16, GGML_METAL_KERNEL_TYPE_CPY_F16_F32, + GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, + GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, @@ -480,7 +496,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de // dictionary of preprocessor macros NSMutableDictionary * prep = [NSMutableDictionary dictionary]; - MTLCompileOptions* options = [MTLCompileOptions new]; + MTLCompileOptions * options = [MTLCompileOptions new]; options.preprocessorMacros = prep; //[options setFastMathEnabled:false]; @@ -530,9 +546,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de } } - GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx_dev->support_simdgroup_reduction ? "true" : "false"); - GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx_dev->support_simdgroup_mm ? "true" : "false"); - GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); ctx->capture_next_compute = false; ctx->capture_started = false; @@ -578,8 +595,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } - const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; - const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; + const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; + const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; + const bool has_bfloat = ctx_dev->has_bfloat; // simd_sum and simd_max requires MTLGPUFamilyApple7 @@ -607,14 +625,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, get_rows_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); @@ -635,101 +654,108 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, mul_mv_id_bf16_f32, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, mul_mm_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); @@ -745,58 +771,61 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, flash_attn_ext_q4_0_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, flash_attn_ext_q4_0_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, flash_attn_ext_q4_1_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, flash_attn_ext_q4_1_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, flash_attn_ext_q5_0_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, flash_attn_ext_q5_0_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, flash_attn_ext_q5_1_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, flash_attn_ext_q5_1_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, flash_attn_ext_q8_0_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, flash_attn_ext_q4_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, flash_attn_ext_q4_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, flash_attn_ext_q4_1_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, flash_attn_ext_q4_1_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, flash_attn_ext_q5_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, flash_attn_ext_q5_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, flash_attn_ext_q5_1_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, flash_attn_ext_q5_1_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, flash_attn_ext_q8_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, cpy_bf16_f32, has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, cpy_bf16_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); @@ -886,15 +915,18 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs } static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) { - for (size_t i = 0, n = 3; i < n; ++i) { - if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { - return false; + const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; + const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; + const bool has_bfloat = ctx_dev->has_bfloat; + + if (!has_bfloat) { + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { + return false; + } } } - const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; - const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; - switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -932,7 +964,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_SOFT_MAX: case GGML_OP_RMS_NORM: case GGML_OP_GROUP_NORM: - return support_simdgroup_reduction; + return has_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ROPE: return true; @@ -952,13 +984,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex if (op->src[1]->type != op->src[2]->type) { return false; } - return support_simdgroup_mm; // TODO: over-restricted for vec-kernels + return has_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return support_simdgroup_reduction && + return has_simdgroup_reduction && (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32); case GGML_OP_CPY: case GGML_OP_DUP: @@ -969,6 +1001,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex switch (op->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -981,10 +1014,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } case GGML_TYPE_F16: switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_F16: return true; - default: + default: + return false; + } + case GGML_TYPE_BF16: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_BF16: + return true; + default: return false; } default: @@ -1855,6 +1896,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; default: break; } @@ -1863,6 +1905,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; @@ -1940,6 +1983,25 @@ static void ggml_metal_encode_node( nrows = 4; } } break; + case GGML_TYPE_BF16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; + nrows = 4; + } + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; + nrows = 4; + } + } break; case GGML_TYPE_Q4_0: { nth0 = 8; @@ -2158,12 +2220,12 @@ static void ggml_metal_encode_node( if ([device supportsFamily:MTLGPUFamilyApple7] && ne00 % 32 == 0 && ne00 >= 64 && dst_rows > dst_rows_min) { - // some Metal matrix data types require aligned pointers // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; default: break; } @@ -2172,6 +2234,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; @@ -2241,6 +2304,13 @@ static void ggml_metal_encode_node( nth1 = 1; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; } break; + case GGML_TYPE_BF16: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32].pipeline; + } break; case GGML_TYPE_Q4_0: { nth0 = 8; @@ -2438,6 +2508,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; @@ -3237,6 +3308,7 @@ static void ggml_metal_encode_node( switch (dstt) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_BF16].pipeline; break; case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; @@ -3254,6 +3326,14 @@ static void ggml_metal_encode_node( default: GGML_ABORT("not implemented"); }; } break; + case GGML_TYPE_BF16: + { + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; default: GGML_ABORT("not implemented"); } diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index b9ea9f08e..16b5da3ff 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -12,6 +12,20 @@ using namespace metal; #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 +// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf +// +// cmd: +// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal.metal +// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal +// +#if __METAL_VERSION__ < 310 +#define GGML_METAL_NO_BFLOAT +#endif + +#if !defined(GGML_METAL_NO_BFLOAT) +typedef matrix bfloat4x4; +#endif + constexpr constant static float kvalues_iq4nl_f[16] = { -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f }; @@ -27,6 +41,13 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) reg = (type4x4)(*src); } +#if !defined(GGML_METAL_NO_BFLOAT) +template +void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} +#endif + template void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { device const uint16_t * qs = ((device const uint16_t *)xb + 1); @@ -2041,6 +2062,10 @@ typedef decltype(kernel_mul_mv) mul_mv_t; template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t kernel_mul_mv; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv; +template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv; +#endif template kernel void kernel_mul_mv_1row( @@ -2110,6 +2135,9 @@ kernel void kernel_mul_mv_1row( typedef decltype(kernel_mul_mv_1row) mul_mv_1row_t; template [[host_name("kernel_mul_mv_f16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; +#endif // Assumes row size (ne00) is a multiple of 4 template @@ -2169,6 +2197,9 @@ kernel void kernel_mul_mv_l4( typedef decltype(kernel_mul_mv_l4) mul_mv_l4_t; template [[host_name("kernel_mul_mv_f16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; +#endif static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / max(0.001f, high - low); @@ -3565,10 +3596,17 @@ kernel void kernel_cpy( typedef decltype(kernel_cpy) kernel_cpy_t; -template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy; +#endif +template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy; +#endif kernel void kernel_cpy_f32_q8_0( device const float * src0, @@ -6473,6 +6511,9 @@ typedef decltype(kernel_get_rows_f) get_rows_f_t; template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f; +#endif typedef decltype(kernel_get_rows_q) get_rows_q_t; @@ -6504,6 +6545,9 @@ typedef decltype(kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +#endif template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -6532,6 +6576,9 @@ typedef decltype(kernel_mul_mm_id) mat_mm_id_t; template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#endif template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -6755,6 +6802,9 @@ typedef decltype(kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_id_bf16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#endif template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 46346cbd0..6cc77edab 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3599,7 +3599,7 @@ static std::vector> make_test_cases_eval() { for (int n_mats : {4}) { for (int n_used : {2}) { for (bool b : {false}) { - for (int n : {1}) { + for (int n : {1, 32}) { int m = 512; int k = 256; test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); From 3bcd40b3c593d14261fb2abfabad3c0fb5b9e318 Mon Sep 17 00:00:00 2001 From: Zhiyuan Li Date: Thu, 7 Nov 2024 18:19:10 +1100 Subject: [PATCH 045/279] Optimize RWKV6 Operator Naming and Implement Multi-core CPU/ SYCL Acceleration (#10133) * rwkv6: rename to wkv6 * rwkv6: support avx2 avx512 armv8 armv9 * rwkv6: update cuda file name * rwkv6: rename params * wkv on sycl * sycl: add some ops * sycl: Enhance OP support judgment * wkv6: drop armv9 and tranfer to GGML style ggml-ci * sync : ggml * update the function to use appropriate types * fix define error * Update ggml/src/ggml-cpu.c * add appropriate asserts * move element-wise functions outside * put the declaration outside the loop * rewrite to be more inline with the common pattern for distributing threads * use recommended way GGML_TENSOR_LOCALS --------- Co-authored-by: Georgi Gerganov Co-authored-by: Diego Devesa Co-authored-by: Plamen Minev Co-authored-by: Yuri Khrustalev Co-authored-by: Meng, Hengyu --- docs/backend/SYCL.md | 2 +- ggml/include/ggml.h | 4 +- ggml/src/ggml-cpu.c | 208 +++- ggml/src/ggml-cuda.cu | 8 +- ggml/src/ggml-cuda/rwkv-wkv.cuh | 5 - ggml/src/ggml-cuda/{rwkv-wkv.cu => wkv6.cu} | 6 +- ggml/src/ggml-cuda/wkv6.cuh | 5 + ggml/src/ggml-sycl.cpp | 1121 +++---------------- ggml/src/ggml-sycl/backend.hpp | 3 + ggml/src/ggml-sycl/common.cpp | 40 + ggml/src/ggml-sycl/common.hpp | 258 +++++ ggml/src/ggml-sycl/concat.cpp | 1 + ggml/src/ggml-sycl/element_wise.cpp | 1011 +++++++++++++++++ ggml/src/ggml-sycl/element_wise.hpp | 76 ++ ggml/src/ggml-sycl/outprod.cpp | 55 + ggml/src/ggml-sycl/outprod.hpp | 11 + ggml/src/ggml-sycl/presets.hpp | 6 + ggml/src/ggml-sycl/wkv6.cpp | 138 +++ ggml/src/ggml-sycl/wkv6.hpp | 10 + ggml/src/ggml.c | 12 +- src/llama.cpp | 8 +- tests/test-backend-ops.cpp | 16 +- 22 files changed, 1977 insertions(+), 1027 deletions(-) delete mode 100644 ggml/src/ggml-cuda/rwkv-wkv.cuh rename ggml/src/ggml-cuda/{rwkv-wkv.cu => wkv6.cu} (93%) create mode 100644 ggml/src/ggml-cuda/wkv6.cuh create mode 100644 ggml/src/ggml-sycl/element_wise.cpp create mode 100644 ggml/src/ggml-sycl/element_wise.hpp create mode 100644 ggml/src/ggml-sycl/outprod.cpp create mode 100644 ggml/src/ggml-sycl/outprod.hpp create mode 100644 ggml/src/ggml-sycl/wkv6.cpp create mode 100644 ggml/src/ggml-sycl/wkv6.hpp diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index ea34182e4..bc8c0f886 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -377,7 +377,7 @@ found 2 SYCL devices: |Chosen Device ID|Setting| |-|-| -|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action| +|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action| |1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`| |0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8a0bcbff8..0d143d2fe 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -509,7 +509,7 @@ extern "C" { GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, GGML_OP_ADD_REL_POS, - GGML_OP_RWKV_WKV, + GGML_OP_RWKV_WKV6, GGML_OP_UNARY, @@ -1819,7 +1819,7 @@ extern "C" { struct ggml_tensor * pw, struct ggml_tensor * ph); - GGML_API struct ggml_tensor * ggml_rwkv_wkv( + GGML_API struct ggml_tensor * ggml_rwkv_wkv6( struct ggml_context * ctx, struct ggml_tensor * k, struct ggml_tensor * v, diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c index 0cb5b824a..98c3e21ae 100644 --- a/ggml/src/ggml-cpu.c +++ b/ggml/src/ggml-cpu.c @@ -11642,24 +11642,30 @@ static void ggml_compute_forward_add_rel_pos( } } -// ggml_compute_forward_rwkv_wkv +// ggml_compute_forward_rwkv_wkv6 -static void ggml_compute_forward_rwkv_wkv_f32( +static void ggml_compute_forward_rwkv_wkv6_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const size_t T = dst->src[1]->ne[3]; - const size_t C = dst->ne[0]; - const size_t H = dst->src[1]->ne[2]; - const size_t n_seqs = dst->src[5]->ne[1]; + const int64_t T = dst->src[1]->ne[3]; + const int64_t C = dst->ne[0]; + const int64_t HEADS = dst->src[1]->ne[2]; + const int64_t n_seqs = dst->src[5]->ne[1]; + const int64_t head_size = C / HEADS; float * dst_data = (float *) dst->data; float * state = ((float *) dst->data) + C * T; - if (params->ith != 0) { + const int ith = params->ith; + const int nth = params->nth; + + if (ith >= HEADS) { return; } - memset(dst_data, 0, T * C * sizeof(float)); + const int h_start = (HEADS * ith) / nth; + const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? + (HEADS * (ith + 1)) / nth : HEADS; float * k = (float *) dst->src[0]->data; float * v = (float *) dst->src[1]->data; @@ -11667,54 +11673,160 @@ static void ggml_compute_forward_rwkv_wkv_f32( float * time_faaaa = (float *) dst->src[3]->data; float * time_decay = (float *) dst->src[4]->data; - size_t t_stride = H * (C / H); + size_t t_stride = HEADS * head_size; // Same to C - size_t h_stride = C / H; - size_t h_stride_2d = (C / H) * (C / H); + size_t h_stride = C / HEADS; + GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS + size_t h_stride_2d = head_size * head_size; - // basically fused operations: - // dst = r @ (time_faaaa * (k @ v) + state), - // state = time_decay * state + (k @ v), - // recursive through each token - for (size_t t = 0; t < T; t++) { - size_t t_offset = t * t_stride; - size_t state_offset = (C / H) * C * (t / (T / n_seqs)); - float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; + if (ith == 0) { + memset(dst_data, 0, T * C * sizeof(float)); + } + ggml_barrier(params->threadpool); - for (size_t h = 0; h < H; h++) { - size_t h_offset = h * h_stride; - size_t t_h_offset = t_offset + h_offset; - size_t h_2d_offset = h * h_stride_2d; - for (size_t i = 0; i < C / H; i++) { - size_t t_h_i_offset = t_h_offset + i; - size_t h_i_offset = h_offset + i; - size_t h_2d_i_offset = h_2d_offset + i * h_stride; + #if defined(__AVX__) && !defined(__AVX512F__) + #define GGML_F32X GGML_F32x8 + #define GGML_F32X_SET1 GGML_F32x8_SET1 + #define GGML_F32X_LOAD GGML_F32x8_LOAD + #define GGML_F32X_STORE GGML_F32x8_STORE + #define GGML_F32X_MUL GGML_F32x8_MUL + #define GGML_F32X_FMA GGML_F32x8_FMA + #define WKV_VECTOR_SIZE 8 + #elif defined(__AVX512F__) + #define GGML_F32X GGML_F32x16 + #define GGML_F32X_SET1 GGML_F32x16_SET1 + #define GGML_F32X_LOAD GGML_F32x16_LOAD + #define GGML_F32X_STORE GGML_F32x16_STORE + #define GGML_F32X_MUL GGML_F32x16_MUL + #define GGML_F32X_FMA GGML_F32x16_FMA + #define WKV_VECTOR_SIZE 16 + #elif defined(__ARM_NEON) && defined(__aarch64__) + #define GGML_F32X GGML_F32x4 + #define GGML_F32X_SET1 GGML_F32x4_SET1 + #define GGML_F32X_LOAD GGML_F32x4_LOAD + #define GGML_F32X_STORE GGML_F32x4_STORE + #define GGML_F32X_MUL GGML_F32x4_MUL + #define GGML_F32X_FMA GGML_F32x4_FMA + #define WKV_VECTOR_SIZE 4 + #endif - float k_val = k[t_h_i_offset]; - float r_val = r[t_h_i_offset]; - float time_faaaa_val = time_faaaa[h_i_offset]; - // RWKV v6: different time_decay for each token. - float time_decay_val = time_decay[t_h_i_offset]; + #ifdef WKV_VECTOR_SIZE + const int64_t vec_count = head_size / WKV_VECTOR_SIZE; - for (size_t j = 0; j < C / H; j ++) { - size_t t_h_j_offset = t_h_offset + j; - size_t h_2d_i_j_offset = h_2d_i_offset + j; + for (int64_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + size_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; - float v_val = v[t_h_j_offset]; - float kv_val = v_val * k_val; - float prev_state_val = state_prev[h_2d_i_j_offset]; - float temp_val = kv_val * time_faaaa_val + prev_state_val; - dst_data[t_h_j_offset] += temp_val * r_val; - state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + for (int64_t h = h_start; h < h_end; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (int64_t i = 0; i < head_size; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_i_offset = h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float r_val = r[t_h_i_offset]; + float time_faaaa_val = time_faaaa[h_i_offset]; + float time_decay_val = time_decay[t_h_i_offset]; + + // Broadcast scalar values to vectors + GGML_F32X k_vec = GGML_F32X_SET1(k_val); + GGML_F32X r_vec = GGML_F32X_SET1(r_val); + GGML_F32X time_faaaa_vec = GGML_F32X_SET1(time_faaaa_val); + GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val); + + for (int64_t j = 0; j < vec_count; j++) { + size_t base_j = j * WKV_VECTOR_SIZE; + size_t t_h_j_offset = t_h_offset + base_j; + size_t h_2d_i_j_offset = h_2d_i_offset + base_j; + + // Load x elements at once + GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]); + GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]); + GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]); + + // Compute kv = v * k + GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec); + + // Compute temp = kv * time_faaaa + prev_state + GGML_F32X temp_vec = GGML_F32X_FMA(prev_state_vec, kv_vec, time_faaaa_vec); + + // Update dst: dst += temp * r + dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, r_vec); + GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec); + + // Update state: state = prev_state * time_decay + kv + GGML_F32X new_state_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, time_decay_vec); + GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], new_state_vec); + } + + // Handle remaining elements, this will not be used. + for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + float temp_val = kv_val * time_faaaa_val + prev_state_val; + dst_data[t_h_j_offset] += temp_val * r_val; + state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + } } } } - } + + #else + // basically fused operations: + // dst = r @ (time_faaaa * (k @ v) + state), + // state = time_decay * state + (k @ v), + // recursive through each token + for (int64_t t = 0; t < T; t++) { + size_t t_offset = t * t_stride; + size_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + size_t h_offset = h * h_stride; + size_t t_h_offset = t_offset + h_offset; + size_t h_2d_offset = h * h_stride_2d; + + for (int64_t i = 0; i < head_size; i++) { + size_t t_h_i_offset = t_h_offset + i; + size_t h_i_offset = h_offset + i; + size_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float k_val = k[t_h_i_offset]; + float r_val = r[t_h_i_offset]; + float time_faaaa_val = time_faaaa[h_i_offset]; + // RWKV v6: different time_decay for each token. + float time_decay_val = time_decay[t_h_i_offset]; + + for (int64_t j = 0; j < head_size; j++) { + size_t t_h_j_offset = t_h_offset + j; + size_t h_2d_i_j_offset = h_2d_i_offset + j; + + float v_val = v[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + float temp_val = kv_val * time_faaaa_val + prev_state_val; + dst_data[t_h_j_offset] += temp_val * r_val; + state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; + } + } + } + } + #endif } -static void ggml_compute_forward_rwkv_wkv( + +static void ggml_compute_forward_rwkv_wkv6( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -11723,7 +11835,7 @@ static void ggml_compute_forward_rwkv_wkv( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_rwkv_wkv_f32(params, dst); + ggml_compute_forward_rwkv_wkv6_f32(params, dst); } break; default: { @@ -12475,9 +12587,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_add_rel_pos(params, tensor); } break; - case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_WKV6: { - ggml_compute_forward_rwkv_wkv(params, tensor); + ggml_compute_forward_rwkv_wkv6(params, tensor); } break; case GGML_OP_MAP_UNARY: { @@ -12775,7 +12887,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: - case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_WKV6: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e68e40550..e27c8e87d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -36,7 +36,7 @@ #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" -#include "ggml-cuda/rwkv-wkv.cuh" +#include "ggml-cuda/wkv6.cuh" #include #include @@ -2319,8 +2319,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_CROSS_ENTROPY_LOSS: ggml_cuda_cross_entropy_loss(ctx, dst); break; - case GGML_OP_RWKV_WKV: - ggml_cuda_op_rwkv_wkv(ctx, dst); + case GGML_OP_RWKV_WKV6: + ggml_cuda_op_rwkv_wkv6(ctx, dst); break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_cuda_cross_entropy_loss_back(ctx, dst); @@ -3153,7 +3153,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: - case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_WKV6: return true; case GGML_OP_FLASH_ATTN_EXT: { #ifndef FLASH_ATTN_AVAILABLE diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cuh b/ggml/src/ggml-cuda/rwkv-wkv.cuh deleted file mode 100644 index 13795247f..000000000 --- a/ggml/src/ggml-cuda/rwkv-wkv.cuh +++ /dev/null @@ -1,5 +0,0 @@ -#include "common.cuh" - -#define CUDA_WKV_BLOCK_SIZE 64 - -void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/rwkv-wkv.cu b/ggml/src/ggml-cuda/wkv6.cu similarity index 93% rename from ggml/src/ggml-cuda/rwkv-wkv.cu rename to ggml/src/ggml-cuda/wkv6.cu index 098e92d35..42578341a 100644 --- a/ggml/src/ggml-cuda/rwkv-wkv.cu +++ b/ggml/src/ggml-cuda/wkv6.cu @@ -1,5 +1,5 @@ #include "common.cuh" -#include "rwkv-wkv.cuh" +#include "wkv6.cuh" static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) { const int tid = threadIdx.x; @@ -64,7 +64,7 @@ static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const } } -void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const float * k_d = (const float *)dst->src[0]->data; const float * v_d = (const float *)dst->src[1]->data; const float * r_d = (const float *)dst->src[2]->data; @@ -83,7 +83,7 @@ void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); GGML_ASSERT(C % H == 0); - GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); + GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); // The current cuda kernel is designed for RWKV6, HEAD_SIZE == 64 rwkv_wkv_f32<<>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d); } diff --git a/ggml/src/ggml-cuda/wkv6.cuh b/ggml/src/ggml-cuda/wkv6.cuh new file mode 100644 index 000000000..a7124ee51 --- /dev/null +++ b/ggml/src/ggml-cuda/wkv6.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_WKV_BLOCK_SIZE 64 + +void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index a62c67f4f..255bc64c6 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -1194,272 +1194,8 @@ typedef void (*ggml_sycl_op_mul_mat_t)( float *dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const queue_ptr &stream); -typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream); -static __dpct_inline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} -static __dpct_inline__ float op_add(const float a, const float b) { - return a + b; -} - -static __dpct_inline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __dpct_inline__ float op_div(const float a, const float b) { - return a / b; -} - -template -static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) / - ne3; - const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) % - ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - for (int i0 = i0s; i0 < ne0; - i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); - } -} - -template -static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; - - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); -} - -static void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= ne) { - return; - } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; - } -} - -static void gelu_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - float xi = x[i]; - dst[i] = 0.5f * xi * - (1.0f + - sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi))); -} - -static void silu_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i])); -} - -static void gelu_quick_f32(const float *x, float *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const float GELU_QUICK_COEF = -1.702f; - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i]))); -} - -static void tanh_f32(const float *x, float *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = sycl::tanh((float)(x[i])); -} - -static void relu_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmax((float)(x[i]), (float)0); -} - -static void hardsigmoid_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); -} - -static void hardswish_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); -} - -static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = sycl::fmax((float)(x[i]), (float)0) + - sycl::fmin((float)(x[i]), 0.0f) * negative_slope; -} - -static void sqr_f32(const float * x, float * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] * x[i]; -} - -static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { - int index = item_ct1.get_local_id(0) + - item_ct1.get_group(0) * item_ct1.get_local_range(0); - if (index >= ne10 * ne11 * ne12 * ne13) { - return; - } - // operation - int i10 = index % ne10; - int i11 = (index / ne10) % ne11; - int i12 = (index / (ne10 * ne11)) % ne12; - int i13 = (index / (ne10 * ne11 * ne12)) % ne13; - - int i00 = i10 / sf0; - int i01 = i11 / sf1; - int i02 = i12 / sf2; - int i03 = i13 / sf3; - - dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); -} - -static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (nidx < ne00 && item_ct1.get_group(1) < ne01 && - item_ct1.get_group(0) < ne02) { - int offset_src = nidx + item_ct1.get_group(1) * ne00 + - item_ct1.get_group(0) * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } else { - dst[offset_dst] = 0.0f; - } -} template static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, @@ -2148,297 +1884,6 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens (void) dst; } -template -struct bin_bcast_sycl { - template - void operator()(ggml_backend_sycl_context & ctx, - const struct ggml_tensor *src0, - const struct ggml_tensor *src1, struct ggml_tensor *dst, - const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - int nr0 = ne10/ne0; - int nr1 = ne11/ne1; - int nr2 = ne12/ne2; - int nr3 = ne13/ne3; - - int nr[4] = { nr0, nr1, nr2, nr3 }; - - // collapse dimensions until first broadcast dimension - int64_t cne0[] = {ne0, ne1, ne2, ne3}; - int64_t cne1[] = {ne10, ne11, ne12, ne13}; - size_t cnb0[] = {nb0, nb1, nb2, nb3}; - size_t cnb1[] = {nb10, nb11, nb12, nb13}; - auto collapse = [](int64_t cne[]) { - cne[0] *= cne[1]; - cne[1] = cne[2]; - cne[2] = cne[3]; - cne[3] = 1; - }; - - auto collapse_nb = [](size_t cnb[], int64_t cne[]) { - cnb[1] *= cne[1]; - cnb[2] *= cne[2]; - cnb[3] *= cne[3]; - }; - - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne0); - collapse(cne1); - } - } - { - int64_t ne0 = cne0[0]; - int64_t ne1 = cne0[1]; - int64_t ne2 = cne0[2]; - int64_t ne3 = cne0[3]; - - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - - size_t nb0 = cnb0[0]; - size_t nb1 = cnb0[1]; - size_t nb2 = cnb0[2]; - size_t nb3 = cnb0[3]; - - size_t nb10 = cnb1[0]; - size_t nb11 = cnb1[1]; - size_t nb12 = cnb1[2]; - size_t nb13 = cnb1[3]; - - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - size_t s10 = nb10 / sizeof(src1_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - GGML_ASSERT(s0 == 1); - GGML_ASSERT(s10 == 1); - - const int block_size = 128; - - int64_t hne0 = std::max(ne0/2LL, 1LL); - - sycl::range<3> block_dims(1, 1, 1); - block_dims[2] = std::min(hne0, block_size); - block_dims[1] = std::min( - ne1, block_size / (unsigned int)block_dims[2]); - block_dims[0] = std::min( - std::min( - ne2 * ne3, block_size / (unsigned int)block_dims[2] / - (unsigned int)block_dims[1]), - 64U); - - sycl::range<3> block_nums( - (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], - (ne1 + block_dims[1] - 1) / block_dims[1], - (hne0 + block_dims[2] - 1) / block_dims[2]); - - if (block_nums[0] > 65535) { - // this is the maximum number of blocks in z direction, fallback to 1D grid kernel - int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * - sycl::range<3>(1, 1, block_size), - sycl::range<3>(1, 1, block_size)), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast_unravel( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, - s13, item_ct1); - }); - } - } else { - /* - DPCT1049:16: The work-group size passed to the SYCL kernel may - exceed the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if - needed. - */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, - ne2, ne3, ne10, ne11, ne12, ne13, - s1, s2, s3, s11, s12, s13, - item_ct1); - }); - } - } - } -}; - -static void acc_f32_sycl(const float *x, const float *y, float *dst, - const int n_elements, const int ne10, const int ne11, - const int ne12, const int nb1, const int nb2, - const int offset, queue_ptr stream) { - int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, - item_ct1); - }); -} - -static void gelu_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_f32(x, dst, k, item_ct1); - }); -} - -static void silu_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - silu_f32(x, dst, k, item_ct1); - }); -} - -static void gelu_quick_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_quick_f32(x, dst, k, item_ct1); - }); -} - -static void tanh_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - tanh_f32(x, dst, k, item_ct1); - }); -} - -static void relu_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - relu_f32(x, dst, k, item_ct1); - }); -} - -static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardsigmoid_f32(x, dst, k, item_ct1); - }); -} - -static void hardswish_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardswish_f32(x, dst, k, item_ct1); - }); -} - -static void leaky_relu_f32_sycl(const float *x, float *dst, const int k, - const float negative_slope, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - leaky_relu_f32(x, dst, k, negative_slope, item_ct1); - }); -} - -static void sqr_f32_sycl(const float *x, float *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sqr_f32(x, dst, k, item_ct1); - }); -} - -static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, queue_ptr stream) { - int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; - sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); - }); -} - -static void pad_f32_sycl(const float *x, float *dst, const int ne00, - const int ne01, const int ne02, const int ne0, - const int ne1, const int ne2, queue_ptr stream) { - int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; - sycl::range<3> gridDim(ne2, ne1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1); - }); -} static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, const int ky, const int kx_padded, @@ -2816,6 +2261,58 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, } } +static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, + const int nrows, queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE); + const sycl::range<3> block_nums(1, nrows, 1); + const size_t shared_mem = 256 * sizeof(float); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor shared_data( + sycl::range<1>(shared_mem/sizeof(float)), cgh); + sycl::local_accessor shared_indices( + sycl::range<1>(shared_mem/sizeof(float)), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + const int tid = item_ct1.get_local_id(2); + const int row = item_ct1.get_global_id(1); + + float max_val = -INFINITY; + int max_idx = -1; + + for (int col = tid; col < ncols; col += 256) { + float val = x[row * ncols + col]; + if (val > max_val) { + max_val = val; + max_idx = col; + } + } + + shared_data[tid] = max_val; + shared_indices[tid] = max_idx; + item_ct1.barrier(sycl::access::fence_space::local_space); + + for (int stride = 256/2; stride > 0; stride >>= 1) { + if (tid < stride) { + float val1 = shared_data[tid]; + float val2 = shared_data[tid + stride]; + if (val2 > val1) { + shared_data[tid] = val2; + shared_indices[tid] = shared_indices[tid + stride]; + } + } + item_ct1.barrier(sycl::access::fence_space::local_space); + } + + + if (tid == 0) { + dst[row] = shared_indices[0]; + } + }); + }); +} static void diag_mask_inf_f32_sycl(const float *x, float *dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, @@ -2946,33 +2443,6 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te } } -template -inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, - (sycl::half *)dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, - main_stream); - } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { - op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, - main_stream); - } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { - op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, - main_stream); - } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, - ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -2986,230 +2456,6 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens (void) src1_d; } -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes - - acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); - - (void) dst; -} - -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -static void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -static void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - - leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - const float sf0 = (float)dst->ne[0]/src0->ne[0]; - const float sf1 = (float)dst->ne[1]/src0->ne[1]; - const float sf2 = (float)dst->ne[2]/src0->ne[2]; - const float sf3 = (float)dst->ne[3]/src0->ne[3]; - - upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, - main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} - -inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - pad_f32_sycl(src0_dd, dst_dd, - src0->ne[0], src0->ne[1], src0->ne[2], - dst->ne[0], dst->ne[1], dst->ne[2], main_stream); - - (void) src1; - (void) dst; - (void) src1_dd; -} inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, @@ -3379,6 +2625,23 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens (void) src1_dd; } +inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne = ggml_nelements(src0); + + sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -3419,6 +2682,25 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten (void) src1_dd; } +inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, @@ -3489,46 +2771,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso (void) src1_dd; } -static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const ggml_sycl_op_flatten_t op) try { - const int64_t nrows0 = ggml_nrows(src0); - - const bool use_src1 = src1 != nullptr; - const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - - // dd = data device - float * src0_ddf = (float *) src0->data; - float * src1_ddf = use_src1 ? (float *) src1->data : nullptr; - float * dst_ddf = (float *) dst->data; - - ggml_sycl_pool_alloc src0_f(ctx.pool()); - ggml_sycl_pool_alloc src1_f(ctx.pool()); - ggml_sycl_pool_alloc dst_f(ctx.pool()); - - ggml_sycl_set_device(ctx.device); - queue_ptr main_stream = ctx.stream(); - // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n", - // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device); - - // do the computation - op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); - // print_ggml_tensor("tensor", dst); -} -catch (sycl::exception const &exc) { - - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -3908,115 +3150,24 @@ static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, const ggml_tenso GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_norm); GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - - static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm); GGML_SYCL_DEBUG("call %s done\n", __func__); } +static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { @@ -4632,6 +3783,11 @@ static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_im2col); } +static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum); +} + static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum_rows); @@ -4642,6 +3798,11 @@ static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argsort); } +static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argmax); +} + static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; @@ -4673,6 +3834,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_func_t func; switch (tensor->op) { + case GGML_OP_ARGMAX: + func = ggml_sycl_argmax; + break; case GGML_OP_CONV_TRANSPOSE_1D: func = ggml_sycl_op_conv_transpose_1d; break; @@ -4686,19 +3850,32 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens func = ggml_sycl_dup; break; case GGML_OP_ADD: + case GGML_OP_ADD1: // TODO: more efficient implementation func = ggml_sycl_add; break; + case GGML_OP_SUB: + func = ggml_sycl_sub; + break; case GGML_OP_ACC: func = ggml_sycl_acc; break; case GGML_OP_MUL: func = ggml_sycl_mul; break; + case GGML_OP_LOG: + func = ggml_sycl_log; + break; case GGML_OP_DIV: func = ggml_sycl_div; break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_NEG: + func = ggml_sycl_neg; + break; + case GGML_UNARY_OP_STEP: + func = ggml_sycl_step; + break; case GGML_UNARY_OP_GELU: func = ggml_sycl_gelu; break; @@ -4714,12 +3891,18 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_UNARY_OP_RELU: func = ggml_sycl_relu; break; + case GGML_UNARY_OP_SIGMOID: + func = ggml_sycl_sigmoid; + break; case GGML_UNARY_OP_HARDSIGMOID: func = ggml_sycl_hardsigmoid; break; case GGML_UNARY_OP_HARDSWISH: func = ggml_sycl_hardswish; break; + case GGML_UNARY_OP_EXP: + func = ggml_sycl_exp; + break; default: return false; } @@ -4757,12 +3940,24 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens } func = ggml_sycl_mul_mat_id; break; + case GGML_OP_OUT_PROD: + func = ggml_sycl_op_out_prod; + break; case GGML_OP_SCALE: func = ggml_sycl_scale; break; case GGML_OP_SQR: func = ggml_sycl_sqr; break; + case GGML_OP_SQRT: + func = ggml_sycl_sqrt; + break; + case GGML_OP_SIN: + func = ggml_sycl_sin; + break; + case GGML_OP_COS: + func = ggml_sycl_cos; + break; case GGML_OP_CLAMP: func = ggml_sycl_clamp; break; @@ -4794,6 +3989,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_POOL_2D: func = ggml_sycl_pool2d; break; + case GGML_OP_SUM: + func = ggml_sycl_sum; + break; case GGML_OP_SUM_ROWS: func = ggml_sycl_sum_rows; break; @@ -4803,6 +4001,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_TIMESTEP_EMBEDDING: func = ggml_sycl_op_timestep_embedding; break; + case GGML_OP_RWKV_WKV6: + func = ggml_sycl_op_rwkv_wkv6; + break; default: return false; } @@ -5125,13 +4326,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_EXP: return ggml_is_contiguous(op->src[0]); default: return false; @@ -5168,6 +4373,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } return true; } break; + case GGML_OP_OUT_PROD: + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -5213,10 +4420,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; - int dim = op->op_params[0]; - return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16 && dim == 2; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; } break; case GGML_OP_DUP: + case GGML_OP_ARGMAX: case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_REPEAT: @@ -5225,11 +4432,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_TRANSPOSE: case GGML_OP_NORM: case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_LOG: + case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_RMS_NORM: case GGML_OP_SCALE: case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: case GGML_OP_CLAMP: return true; case GGML_OP_CONT: @@ -5243,6 +4456,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g // TODO: add support for the new F32 operations return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: + case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: @@ -5251,6 +4465,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_RWKV_WKV6: return true; default: return false; @@ -5268,9 +4483,23 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_ return buft_ctx->device == sycl_ctx->device; } +static int64_t get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return op->ne[1]; // this will increse the speed of prefill in test + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; - return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID; + return get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); } diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index d21b5f8dd..85748a5b4 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -26,5 +26,8 @@ #include "softmax.hpp" #include "tsembd.hpp" #include "im2col.hpp" +#include "wkv6.hpp" +#include "outprod.hpp" +#include "element_wise.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index cf5291b31..97ab2003c 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -62,3 +62,43 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block } return sycl_down_blk_size; } + +void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const ggml_sycl_op_flatten_t op) try { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + // dd = data device + float * src0_ddf = (float *) src0->data; + float * src1_ddf = use_src1 ? (float *) src1->data : nullptr; + float * dst_ddf = (float *) dst->data; + + ggml_sycl_pool_alloc src0_f(ctx.pool()); + ggml_sycl_pool_alloc src1_f(ctx.pool()); + ggml_sycl_pool_alloc dst_f(ctx.pool()); + + ggml_sycl_set_device(ctx.device); + queue_ptr main_stream = ctx.stream(); + // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n", + // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device); + + // do the computation + op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + // print_ggml_tensor("tensor", dst); +} +catch (sycl::exception const &exc) { + + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index bc0faa867..4549fa5e9 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -404,4 +404,262 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) { int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size); +typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream); + +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + + +template +struct bin_bcast_sycl { + template + void operator()(ggml_backend_sycl_context & ctx, + const struct ggml_tensor *src0, + const struct ggml_tensor *src1, struct ggml_tensor *dst, + const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, + queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne0[] = {ne0, ne1, ne2, ne3}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb0[] = {nb0, nb1, nb2, nb3}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne0); + collapse(cne1); + } + } + { + int64_t ne0 = cne0[0]; + int64_t ne1 = cne0[1]; + int64_t ne2 = cne0[2]; + int64_t ne3 = cne0[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb0[0]; + size_t nb1 = cnb0[1]; + size_t nb2 = cnb0[2]; + size_t nb3 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); + block_dims[0] = std::min( + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, + s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s11, s12, s13, + item_ct1); + }); + } + } + } +}; + +template +inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, + (sycl::half *)dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, + main_stream); + } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, + main_stream); + } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { + op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, + main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} + + +void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const ggml_sycl_op_flatten_t op); + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index 632eedb9d..c90c452d8 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -106,6 +106,7 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst, concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); }); break; + // dim >=2 will be dispatched to the default path default: stream->parallel_for( sycl::nd_range<3>(gridDim * diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp new file mode 100644 index 000000000..e5cd736eb --- /dev/null +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -0,0 +1,1011 @@ +#include "common.hpp" +#include "element_wise.hpp" + +void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +void gelu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f * xi * + (1.0f + + sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi))); +} + +void silu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i])); +} + +void gelu_quick_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_QUICK_COEF = -1.702f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i]))); +} + +void tanh_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::tanh((float)(x[i])); +} + +void relu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0); +} + +void sigmoid_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i])); +} + +void sqrt_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::sqrt(x[i]); +} + +void sin_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::sin(x[i]); +} + +void cos_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::cos(x[i]); +} + +void hardsigmoid_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +void hardswish_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +void exp_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::exp(x[i]); +} + +void log_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + float xi = x[i]; + if (xi <= 0) { + dst[i] = -INFINITY; + } else { + dst[i] = sycl::log(xi); + } +} + +void neg_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = -x[i]; +} + +void step_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] > 0.0f; +} + +void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0) + + sycl::fmin((float)(x[i]), 0.0f) * negative_slope; +} + +void sqr_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] * x[i]; +} + +void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, + const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int ne13, const float sf0, const float sf1, + const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { + int index = item_ct1.get_local_id(0) + + item_ct1.get_group(0) * item_ct1.get_local_range(0); + if (index >= ne10 * ne11 * ne12 * ne13) { + return; + } + // operation + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = (index / (ne10 * ne11 * ne12)) % ne13; + + int i00 = i10 / sf0; + int i01 = i11 / sf1; + int i02 = i12 / sf2; + int i03 = i13 / sf3; + + dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); +} + +void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00 && item_ct1.get_group(1) < ne01 && + item_ct1.get_group(0) < ne02) { + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + + + +void acc_f32_sycl(const float *x, const float *y, float *dst, + const int n_elements, const int ne10, const int ne11, + const int ne12, const int nb1, const int nb2, + const int offset, queue_ptr stream) { + int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); +} + +void gelu_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_f32(x, dst, k, item_ct1); + }); +} + +void silu_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + silu_f32(x, dst, k, item_ct1); + }); +} + +void gelu_quick_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_quick_f32(x, dst, k, item_ct1); + }); +} + +void tanh_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + tanh_f32(x, dst, k, item_ct1); + }); +} + +void relu_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + relu_f32(x, dst, k, item_ct1); + }); +} + +void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + hardsigmoid_f32(x, dst, k, item_ct1); + }); +} + +void hardswish_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + hardswish_f32(x, dst, k, item_ct1); + }); +} + +void exp_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + exp_f32(x, dst, k, item_ct1); + }); +} + +void log_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + log_f32(x, dst, k, item_ct1); + }); +} + +void neg_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + neg_f32(x, dst, k, item_ct1); + }); +} + +void step_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + step_f32(x, dst, k, item_ct1); + }); +} + +void sigmoid_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sigmoid_f32(x, dst, k, item_ct1); + }); +} + +void sqrt_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sqrt_f32(x, dst, k, item_ct1); + }); +} + +void sin_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sin_f32(x, dst, k, item_ct1); + }); +} + +void cos_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cos_f32(x, dst, k, item_ct1); + }); +} + +void leaky_relu_f32_sycl(const float *x, float *dst, const int k, + const float negative_slope, + queue_ptr stream) { + const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + leaky_relu_f32(x, dst, k, negative_slope, item_ct1); + }); +} + +void sqr_f32_sycl(const float *x, float *dst, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sqr_f32(x, dst, k, item_ct1); + }); +} + +void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, + const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int ne13, const float sf0, const float sf1, + const float sf2, const float sf3, queue_ptr stream) { + int dst_size = ne10 * ne11 * ne12 * ne13; + int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); + }); +} + +void pad_f32_sycl(const float *x, float *dst, const int ne00, + const int ne01, const int ne02, const int ne0, + const int ne1, const int ne2, queue_ptr stream) { + int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1); + }); +} + +inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} +inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const float sf0 = (float)dst->ne[0]/src0->ne[0]; + const float sf1 = (float)dst->ne[1]/src0->ne[1]; + const float sf2 = (float)dst->ne[2]/src0->ne[2]; + const float sf3 = (float)dst->ne[3]/src0->ne[3]; + + upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, + main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_sycl(src0_dd, dst_dd, + src0->ne[0], src0->ne[1], src0->ne[2], + dst->ne[0], dst->ne[1], dst->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); + + (void) dst; +} + +inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const queue_ptr &main_stream) { + + ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + + +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqrt); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sin); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_cos); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sigmoid); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + + +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_exp); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_log); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_neg); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_step); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + + + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sub); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp new file mode 100644 index 000000000..8152edf58 --- /dev/null +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -0,0 +1,76 @@ +#ifndef GGML_SYCL_ELEMENTWISE_HPP +#define GGML_SYCL_ELEMENTWISE_HPP + +#include "common.hpp" + +static __dpct_inline__ float op_repeat(const float a, const float b) { + return b; + GGML_UNUSED(a); +} + +static __dpct_inline__ float op_add(const float a, const float b) { + return a + b; +} + +static __dpct_inline__ float op_sub(const float a, const float b) { + return a - b; +} + +static __dpct_inline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __dpct_inline__ float op_div(const float a, const float b) { + return a / b; +} + + +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +#endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp new file mode 100644 index 000000000..c2779df0e --- /dev/null +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -0,0 +1,55 @@ +#include +#include "outprod.hpp" + + +void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_TENSOR_BINARY_OP_LOCALS + + // Get SYCL queue + dpct::queue_ptr stream = ctx.stream(); + + // Dimension checks + GGML_ASSERT(ne01 == ne11); // Inner dimensions must match + GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows + GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols + + // Get data pointers + const float* src0_d = (const float*)src0->data; + const float* src1_d = (const float*)src1->data; + float* dst_d = (float*)dst->data; + + // GEMM parameters + const float alpha = 1.0f; + const float beta = 0.0f; + + // Handle transposition of src1 + const bool src1_T = ggml_is_transposed(src1); + const oneapi::mkl::transpose src1_op = + src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; + const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); + + try { + // Perform matrix multiplication using oneMKL GEMM + oneapi::mkl::blas::gemm(*stream, + oneapi::mkl::transpose::nontrans, src1_op, + ne0, ne1, ne01, + alpha, + src0_d, ne00, + src1_d, ldb, + beta, + dst_d, ne0); + } + catch (sycl::exception const& exc) { + std::cerr << exc.what() << std::endl; + GGML_ASSERT(false); + } +} diff --git a/ggml/src/ggml-sycl/outprod.hpp b/ggml/src/ggml-sycl/outprod.hpp new file mode 100644 index 000000000..9c042738a --- /dev/null +++ b/ggml/src/ggml-sycl/outprod.hpp @@ -0,0 +1,11 @@ +#ifndef GGML_SYCL_OUTPROD_HPP +#define GGML_SYCL_OUTPROD_HPP + +#include "common.hpp" + +void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst); + + +#endif // GGML_SYCL_OUTPROD_HPP + diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 340ab8e93..af1890727 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -25,6 +25,11 @@ #define SYCL_RELU_BLOCK_SIZE 256 #define SYCL_HARDSIGMOID_BLOCK_SIZE 256 #define SYCL_HARDSWISH_BLOCK_SIZE 256 +#define SYCL_EXP_BLOCK_SIZE 256 +#define SYCL_NEG_BLOCK_SIZE 256 +#define SYCL_SIGMOID_BLOCK_SIZE 256 +#define SYCL_SQRT_BLOCK_SIZE 256 +#define SYCL_SIN_BLOCK_SIZE 256 #define SYCL_SQR_BLOCK_SIZE 256 #define SYCL_CPY_BLOCK_SIZE 32 #define SYCL_SCALE_BLOCK_SIZE 256 @@ -41,6 +46,7 @@ #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 +#define SYCL_ARGMAX_BLOCK_SIZE 256 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 #define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp new file mode 100644 index 000000000..4c737f4bf --- /dev/null +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -0,0 +1,138 @@ +#include +#include "wkv6.hpp" + +constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE + +// Helper function for the main kernel +static void rwkv_wkv_f32_kernel( + const int B, const int T, const int C, const int H, + const float* k, const float* v, const float* r, + const float* tf, const float* td, const float* s, + float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) { + + const int tid = item_ct1.get_local_id(2); + const int bid = item_ct1.get_group(2); + + const int head_size = WKV_BLOCK_SIZE; + const int batch_i = bid / H; + const int head_i = bid % H; + const int state_size = C * head_size; + const int n_seq_tokens = T / B; + + // Set up shared memory pointers + float* _k = shared_mem; + float* _r = _k + head_size; + float* _tf = _r + head_size; + float* _td = _tf + head_size; + + // Local state array + float state[WKV_BLOCK_SIZE]; + + // Load initial state + #pragma unroll + for (int i = 0; i < head_size; i++) { + state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; + } + + // Sync threads before shared memory operations + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Load time-mixing parameters + _tf[tid] = tf[head_i * head_size + tid]; + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Main sequence processing loop + for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; + t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; + t += C) { + + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Load current timestep data to shared memory + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + + item_ct1.barrier(sycl::access::fence_space::local_space); + + const float _v = v[t]; + float y = 0; + + // Process in chunks of 4 for better vectorization + sycl::float4 k4, r4, tf4, td4, s4, kv4; + #pragma unroll + for (int j = 0; j < head_size; j += 4) { + // Load data in vec4 chunks + k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); + td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); + s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]); + + // Compute key-value product + sycl::float4 kv4 = k4 * _v; + + // Accumulate weighted sum + y += sycl::dot(r4, tf4 * kv4 + s4); + + // Update state + s4 = s4 * td4 + kv4; + + // Store updated state + state[j] = s4.x(); + state[j+1] = s4.y(); + state[j+2] = s4.z(); + state[j+3] = s4.w(); + } + + dst[t] = y; + } + + // Save final state + #pragma unroll + for (int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; + } +} + +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + + const float* k_d = (const float*)dst->src[0]->data; + const float* v_d = (const float*)dst->src[1]->data; + const float* r_d = (const float*)dst->src[2]->data; + const float* tf_d = (const float*)dst->src[3]->data; + const float* td_d = (const float*)dst->src[4]->data; + const float* s_d = (const float*)dst->src[5]->data; + float* dst_d = (float*)dst->data; + + const int64_t B = dst->src[5]->ne[1]; + const int64_t T = dst->src[0]->ne[3]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[2]; + + GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64 + + dpct::queue_ptr stream = ctx.stream(); + + // Calculate execution configuration + const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td + sycl::range<3> block_dims(1, 1, C / H); + sycl::range<3> grid_dims(1, 1, B * H); + + // Submit kernel + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rwkv_wkv_f32_kernel( + B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, + item_ct1, shared_mem_acc.get_pointer() + ); + }); + }); +} diff --git a/ggml/src/ggml-sycl/wkv6.hpp b/ggml/src/ggml-sycl/wkv6.hpp new file mode 100644 index 000000000..ddfa3377b --- /dev/null +++ b/ggml/src/ggml-sycl/wkv6.hpp @@ -0,0 +1,10 @@ +#ifndef GGML_SYCL_WKV6_HPP +#define GGML_SYCL_WKV6_HPP + +#include "common.hpp" + +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst); + + +#endif // GGML_SYCL_WKV6_HPP diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 266a0d6f0..bc034015f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -975,7 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "WIN_UNPART", "GET_REL_POS", "ADD_REL_POS", - "RWKV_WKV", + "RWKV_WKV6", "UNARY", @@ -1070,7 +1070,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "win_unpart(x)", "get_rel_pos(x)", "add_rel_pos(x)", - "rwkv_wkv(k, v, r, tf, td, s)", + "rwkv_wkv6(k, v, r, tf, td, s)", "unary(x)", @@ -4503,9 +4503,9 @@ struct ggml_tensor * ggml_add_rel_pos_inplace( return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); } -// ggml_rwkv_wkv +// ggml_rwkv_wkv6 -struct ggml_tensor * ggml_rwkv_wkv( +struct ggml_tensor * ggml_rwkv_wkv6( struct ggml_context * ctx, struct ggml_tensor * k, struct ggml_tensor * v, @@ -4537,7 +4537,7 @@ struct ggml_tensor * ggml_rwkv_wkv( const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_RWKV_WKV; + result->op = GGML_OP_RWKV_WKV6; result->src[0] = k; result->src[1] = v; result->src[2] = r; @@ -6084,7 +6084,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_REL_POS: case GGML_OP_ADD_REL_POS: - case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_WKV6: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/src/llama.cpp b/src/llama.cpp index 6719edb38..034441e1f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7011,7 +7011,7 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, - {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV}}, + {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}}, {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs); op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C); } break; - case GGML_OP_RWKV_WKV: + case GGML_OP_RWKV_WKV6: { // FIXME const int64_t S = 123; @@ -7140,7 +7140,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_tensor * tf = w; ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens); ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); - op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state); + op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); @@ -10083,7 +10083,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( v = ggml_transpose(ctx, v); r = ggml_transpose(ctx, r); - struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); + struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 6cc77edab..9d48a2717 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1614,8 +1614,8 @@ struct test_ssm_scan : public test_case { } }; -// GGML_OP_RWKV_WKV -struct test_rwkv_wkv : public test_case { +// GGML_OP_RWKV_WKV6 +struct test_rwkv_wkv6 : public test_case { const ggml_type type; const int64_t head_count; @@ -1627,7 +1627,7 @@ struct test_rwkv_wkv : public test_case { return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); } - test_rwkv_wkv(ggml_type type = GGML_TYPE_F32, + test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32, int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} @@ -1639,7 +1639,7 @@ struct test_rwkv_wkv : public test_case { ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector{ head_size, head_count }.data()); ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector{ 1, head_size, head_count, n_tokens }.data()); ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); - ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s); + ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s); return out; } }; @@ -3499,10 +3499,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4)); - test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1)); - test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1)); - test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4)); - test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4)); + test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1)); + test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1)); + test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4)); + test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4)); #if 1 for (ggml_type type_a : base_types) { From 2319126a70b541f8670225a04a38202bbdccbedb Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:02:08 -0600 Subject: [PATCH 046/279] fix q4_0_8_8 format for corrupted tokens issue (#10198) Co-authored-by: EC2 Default User --- ggml/src/ggml-cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c index 98c3e21ae..de1de18ec 100644 --- a/ggml/src/ggml-cpu.c +++ b/ggml/src/ggml-cpu.c @@ -409,6 +409,8 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .gemm = ggml_gemm_q4_0_4x8_q8_0, }, [GGML_TYPE_Q4_0_8_8] = { + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, .ncols = 8, .gemv = ggml_gemv_q4_0_8x8_q8_0, From 5107e8cea35be46a27cfc940e6841c0cf81c0525 Mon Sep 17 00:00:00 2001 From: wwoodsTM <104587230+wwoodsTM@users.noreply.github.com> Date: Thu, 7 Nov 2024 08:20:25 -0700 Subject: [PATCH 047/279] DRY: Fixes clone functionality (#10192) --- src/llama-sampling.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c2cfe0a77..fd8ca8a9e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1876,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) { const auto * ctx = (llama_sampler_dry *) smpl->ctx; - // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying - auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); + llama_vocab dummy_vocab; + + // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying + auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); + // Copy the state, including the processed breakers { auto * result_ctx = (llama_sampler_dry *) result->ctx; From 60e17ce23c2740369af6304113a2dfa0454eaf26 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Thu, 7 Nov 2024 11:46:12 -0500 Subject: [PATCH 048/279] Remove identical wte/etw logic for jais (#10203) --- convert_hf_to_gguf.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 76ee6cef5..39afa5ef4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3748,10 +3748,7 @@ class JaisModel(Model): # Embeddings scale self.embeddings_scale = 1.0 - # note: For some JAIS flavors, output is tied to (same as) wte in original model - self.output_is_wte = False if 'mup_embeddings_scale' in self.hparams: - self.output_is_wte = True # Hack (?) self.embeddings_scale = self.hparams['mup_embeddings_scale'] elif 'embeddings_scale' in self.hparams: self.embeddings_scale = self.hparams['embeddings_scale'] @@ -3808,10 +3805,7 @@ class JaisModel(Model): if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append((new_name, data_torch * self.embeddings_scale)) - if self.output_is_wte: - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - assert not self.output_is_wte tensors.append((new_name, data_torch * self.width_scale)) else: tensors.append((new_name, data_torch)) From 97404c4a0374cac45c8c34a32d13819de1dd023d Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 7 Nov 2024 18:16:08 +0100 Subject: [PATCH 049/279] ggml : add ggml-cpu.h to the public headers (#10204) --- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index cfa6e3f70..6866a25d3 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -218,12 +218,12 @@ include(CMakePackageConfigHelpers) # all public headers set(GGML_PUBLIC_HEADERS include/ggml.h + include/ggml-cpu.h include/ggml-alloc.h include/ggml-backend.h include/ggml-blas.h include/ggml-cann.h include/ggml-cuda.h - include/ggml.h include/ggml-kompute.h include/ggml-metal.h include/ggml-rpc.h From a2c6fd747c77fe183e2f556a4a2f1fb0a0be4c7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 Nov 2024 23:07:55 +0200 Subject: [PATCH 050/279] scripts : sync update --- scripts/sync-ggml-am.sh | 88 ++++++++++------------------------------- scripts/sync-ggml.sh | 46 +++++---------------- 2 files changed, 30 insertions(+), 104 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index fba29b935..06a04745b 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -114,46 +114,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # replace filenames: # - # CMakelists.txt -> ggml/CMakeLists.txt - # src/CMakeLists.txt -> ggml/src/CMakeLists.txt - # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake + # CMakelists.txt -> ggml/CMakeLists.txt + # src/CMakeLists.txt -> ggml/src/CMakeLists.txt + # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake # - # src/ggml.c -> ggml/src/ggml.c - # src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c - # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h - # src/ggml-alloc.c -> ggml/src/ggml-alloc.c - # src/ggml-amx/* -> ggml/src/ggml-amx/ - # src/ggml-amx.cpp -> ggml/src/ggml-amx.cpp - # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h - # src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp - # src/ggml-cann/* -> ggml/src/ggml-cann/ - # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp - # src/ggml-common.h -> ggml/src/ggml-common.h - # src/ggml-cuda/* -> ggml/src/ggml-cuda/ - # src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu - # src/ggml-impl.h -> ggml/src/ggml-impl.h - # src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp - # src/ggml-metal.m -> ggml/src/ggml-metal.m - # src/ggml-quants.c -> ggml/src/ggml-quants.c - # src/ggml-quants.h -> ggml/src/ggml-quants.h - # src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp - # src/ggml-sycl/* -> ggml/src/ggml-sycl/ - # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp - # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp - # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/ + # src/ggml*.c -> ggml/src/ggml*.c + # src/ggml*.cpp -> ggml/src/ggml*.cpp + # src/ggml*.h -> ggml/src/ggml*.h + # src/ggml*.cu -> ggml/src/ggml*.cu + # src/ggml*.m -> ggml/src/ggml*.m + # src/ggml-amx/* -> ggml/src/ggml-amx/ + # src/ggml-cann/* -> ggml/src/ggml-cann/ + # src/ggml-cuda/* -> ggml/src/ggml-cuda/ + # src/ggml-sycl/* -> ggml/src/ggml-sycl/ + # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/ # - # include/ggml.h -> ggml/include/ggml.h - # include/ggml-alloc.h -> ggml/include/ggml-alloc.h - # include/ggml-amx.h -> ggml/include/ggml-amx.h - # include/ggml-backend.h -> ggml/include/ggml-backend.h - # include/ggml-blas.h -> ggml/include/ggml-blas.h - # include/ggml-cann.h -> ggml/include/ggml-cann.h - # include/ggml-cuda.h -> ggml/include/ggml-cuda.h - # include/ggml-kompute.h -> ggml/include/ggml-kompute.h - # include/ggml-metal.h -> ggml/include/ggml-metal.h - # include/ggml-rpc.h -> ggml/include/ggml-rpc.h - # include/ggml-sycl.h -> ggml/include/ggml-sycl.h - # include/ggml-vulkan.h -> ggml/include/ggml-vulkan.h + # include/ggml*.h -> ggml/include/ggml*.h # # tests/test-opt.cpp -> tests/test-opt.cpp # tests/test-grad0.cpp -> tests/test-grad0.cpp @@ -168,41 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \ - -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index f5d87324a..8192a8673 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -4,43 +4,17 @@ cp -rpv ../ggml/CMakeLists.txt ./ggml/CMakeLists.txt cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake -cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c -cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c -cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h -cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c -cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/ -cp -rpv ../ggml/src/ggml-amx.cpp ./ggml/src/ggml-amx.cpp -cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h -cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp -cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ -cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp -cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h -cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ -cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu -cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h -cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp -cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m -cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal -cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c -cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h -cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp -cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ -cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp -cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp -cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/ +cp -rpv ../ggml/src/ggml*.c ./ggml/src/ +cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/ +cp -rpv ../ggml/src/ggml*.h ./ggml/src/ +cp -rpv ../ggml/src/ggml*.cu ./ggml/src/ +cp -rpv ../ggml/src/ggml*.m ./ggml/src/ +cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ +cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ +cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ +cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/ -cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h -cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h -cp -rpv ../ggml/include/ggml-amx.h ./ggml/include/ggml-amx.h -cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h -cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h -cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h -cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h -cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h -cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h -cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h -cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h -cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h +cp -rpv ../ggml/include/ggml*.h ./ggml/include/ cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp From 3b08828674f561c78af182d47fc0636fc3ccd1e9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 Nov 2024 23:08:24 +0200 Subject: [PATCH 051/279] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 020c60f34..e82984f49 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -a099cb514d6687e436a5a423d1fb0448be0feb20 +89952d649e0c5cabbb9ff8c4906f5a843a789fb2 From eec4d71737b32f312e0082b671629a0368e1a20d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 7 Nov 2024 23:11:36 +0200 Subject: [PATCH 052/279] scripts : add amx to sync-ggml.sh [no ci] --- scripts/sync-ggml.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 8192a8673..f29554c82 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -9,6 +9,7 @@ cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/ cp -rpv ../ggml/src/ggml*.h ./ggml/src/ cp -rpv ../ggml/src/ggml*.cu ./ggml/src/ cp -rpv ../ggml/src/ggml*.m ./ggml/src/ +cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/ cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ From a71d81cf8c1afb26b166f897c94ee1581f9fac7d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 7 Nov 2024 17:31:10 -0400 Subject: [PATCH 053/279] server : revamp chat UI with vuejs and daisyui (#10175) * server : simple chat UI with vuejs and daisyui * move old files to legacy folder * embed deps into binary * basic markdown support * add conversation history, save to localStorage * fix bg-base classes * save theme preferences * fix tests * regenerate, edit, copy buttons * small fixes * docs: how to use legacy ui * better error handling * make CORS preflight more explicit * add GET method for CORS * fix tests * clean up a bit * better auto scroll * small fixes * use collapse-arrow * fix closeAndSaveConfigDialog * small fix * remove console.log * fix style for
 element

* lighter bubble color (less distract when reading)
---
 .editorconfig                                 |    10 +
 Makefile                                      |    17 +-
 examples/server/CMakeLists.txt                |    17 +-
 examples/server/README.md                     |    10 +
 examples/server/chat.mjs                      |     2 +-
 examples/server/deps.sh                       |    19 +-
 examples/server/public/completion.js          |    29 +-
 examples/server/public/deps_daisyui.min.css   |    13 +
 examples/server/public/deps_markdown-it.js    |  8442 +++++++
 examples/server/public/deps_tailwindcss.js    |    82 +
 .../server/public/deps_vue.esm-browser.js     | 18160 ++++++++++++++++
 examples/server/public/index.html             |  1851 +-
 .../{public => public_legacy}/colorthemes.css |     0
 examples/server/public_legacy/completion.js   |   209 +
 .../{public => public_legacy}/favicon.ico     |   Bin
 .../{public => public_legacy}/index-new.html  |     0
 examples/server/public_legacy/index.html      |  1303 ++
 .../server/{public => public_legacy}/index.js |     0
 .../json-schema-to-grammar.mjs                |     0
 examples/server/public_legacy/loading.html    |    12 +
 .../prompt-formats.js                         |     0
 .../{public => public_legacy}/style.css       |     0
 .../system-prompts.js                         |     0
 .../theme-beeninorder.css                     |     0
 .../theme-ketivah.css                         |     0
 .../theme-mangotango.css                      |     0
 .../theme-playground.css                      |     0
 .../theme-polarnight.css                      |     0
 .../theme-snowstorm.css                       |     0
 examples/server/server.cpp                    |    71 +-
 .../server/tests/features/security.feature    |     2 +-
 grammars/README.md                            |     2 +-
 tests/run-json-schema-to-grammar.mjs          |     2 +-
 33 files changed, 28884 insertions(+), 1369 deletions(-)
 create mode 100644 examples/server/public/deps_daisyui.min.css
 create mode 100644 examples/server/public/deps_markdown-it.js
 create mode 100644 examples/server/public/deps_tailwindcss.js
 create mode 100644 examples/server/public/deps_vue.esm-browser.js
 rename examples/server/{public => public_legacy}/colorthemes.css (100%)
 create mode 100644 examples/server/public_legacy/completion.js
 rename examples/server/{public => public_legacy}/favicon.ico (100%)
 rename examples/server/{public => public_legacy}/index-new.html (100%)
 create mode 100644 examples/server/public_legacy/index.html
 rename examples/server/{public => public_legacy}/index.js (100%)
 rename examples/server/{public => public_legacy}/json-schema-to-grammar.mjs (100%)
 create mode 100644 examples/server/public_legacy/loading.html
 rename examples/server/{public => public_legacy}/prompt-formats.js (100%)
 rename examples/server/{public => public_legacy}/style.css (100%)
 rename examples/server/{public => public_legacy}/system-prompts.js (100%)
 rename examples/server/{public => public_legacy}/theme-beeninorder.css (100%)
 rename examples/server/{public => public_legacy}/theme-ketivah.css (100%)
 rename examples/server/{public => public_legacy}/theme-mangotango.css (100%)
 rename examples/server/{public => public_legacy}/theme-playground.css (100%)
 rename examples/server/{public => public_legacy}/theme-polarnight.css (100%)
 rename examples/server/{public => public_legacy}/theme-snowstorm.css (100%)

diff --git a/.editorconfig b/.editorconfig
index f88f8da67..eac38a15f 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -24,6 +24,16 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
diff --git a/Makefile b/Makefile
index eb1da90f1..b9131eae5 100644
--- a/Makefile
+++ b/Makefile
@@ -1455,22 +1455,13 @@ llama-server: \
 	examples/server/server.cpp \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
-	examples/server/colorthemes.css.hpp \
-	examples/server/style.css.hpp \
-	examples/server/theme-beeninorder.css.hpp \
-	examples/server/theme-ketivah.css.hpp \
-	examples/server/theme-mangotango.css.hpp \
-	examples/server/theme-playground.css.hpp \
-	examples/server/theme-polarnight.css.hpp \
-	examples/server/theme-snowstorm.css.hpp \
 	examples/server/index.html.hpp \
-	examples/server/index-new.html.hpp \
-	examples/server/index.js.hpp \
 	examples/server/completion.js.hpp \
-	examples/server/system-prompts.js.hpp \
-	examples/server/prompt-formats.js.hpp \
-	examples/server/json-schema-to-grammar.mjs.hpp \
 	examples/server/loading.html.hpp \
+	examples/server/deps_daisyui.min.css.hpp \
+	examples/server/deps_markdown-it.js.hpp \
+	examples/server/deps_tailwindcss.js.hpp \
+	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 3e717e882..93e876f5a 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -15,22 +15,13 @@ set(TARGET_SRCS
     httplib.h
 )
 set(PUBLIC_ASSETS
-    colorthemes.css
-    style.css
-    theme-beeninorder.css
-    theme-ketivah.css
-    theme-mangotango.css
-    theme-playground.css
-    theme-polarnight.css
-    theme-snowstorm.css
     index.html
-    index-new.html
-    index.js
     completion.js
-    system-prompts.js
-    prompt-formats.js
-    json-schema-to-grammar.mjs
     loading.html
+    deps_daisyui.min.css
+    deps_markdown-it.js
+    deps_tailwindcss.js
+    deps_vue.esm-browser.js
 )
 
 foreach(asset ${PUBLIC_ASSETS})
diff --git a/examples/server/README.md b/examples/server/README.md
index 15f95db1e..562494077 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
 }
 ```
 
+### Legacy completion web UI
+
+A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
+
+For example:
+
+```sh
+./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
+```
+
 ### Extending or building alternative Web Front End
 
 You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs
index a79c8a3cd..4fef5655a 100644
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,7 +1,7 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
 import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
 
 const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
diff --git a/examples/server/deps.sh b/examples/server/deps.sh
index d28378901..1ff80d056 100755
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 PUBLIC=$DIR/public
 
 echo "download js bundle files"
-curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
-echo >> $PUBLIC/index.js # add newline
+
+# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
+
+curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
+echo >> $PUBLIC/deps_tailwindcss.js # add newline
+
+curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
+curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
+echo >> $PUBLIC/deps_daisyui.min.css # add newline
+
+curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
+echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
+
+curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
+echo >> $PUBLIC/deps_markdown-it.js # add newline
+
+ls -lah $PUBLIC
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index 36818f764..54a0f22f5 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -1,12 +1,16 @@
 const paramDefaults = {
   stream: true,
-  n_predict: 500,
   temperature: 0.2,
-  stop: [""]
 };
 
 let generation_settings = null;
 
+export class CompletionError extends Error {
+  constructor(message, name, data) {
+    super(message);
+    this.name = name;
+  }
+};
 
 // Completes the prompt as a generator. Recommended for most use cases.
 //
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
 
   const completionParams = { ...paramDefaults, ...params, prompt };
 
-  const response = await fetch(`${api_url}/completion`, {
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
     method: 'POST',
     body: JSON.stringify(completionParams),
     headers: {
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
     signal: controller.signal,
   });
 
+  const status = response.status;
+  if (status !== 200) {
+    try {
+      const body = await response.json();
+      if (body && body.error && body.error.message) {
+        throw new CompletionError(body.error.message, 'ServerError');
+      }
+    } catch (err) {
+      throw new CompletionError(err.message, 'ServerError');
+    }
+  }
+
   const reader = response.body.getReader();
   const decoder = new TextDecoder();
 
@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
       for (const line of lines) {
         const match = regex.exec(line);
         if (match) {
-          result[match[1]] = match[2]
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
           // since we know this is llama.cpp, let's just decode the json in data
           if (result.data) {
             result.data = JSON.parse(result.data);
diff --git a/examples/server/public/deps_daisyui.min.css b/examples/server/public/deps_daisyui.min.css
new file mode 100644
index 000000000..bc8529651
--- /dev/null
+++ b/examples/server/public/deps_daisyui.min.css
@@ -0,0 +1,13 @@
+.alert{display:grid;width:100%;grid-auto-flow:row;align-content:flex-start;align-items:center;justify-items:center;gap:1rem;text-align:center}@media (min-width:640px){.alert{grid-auto-flow:column;grid-template-columns:auto minmax(auto,1fr);justify-items:start;text-align:start}}.artboard{width:100%}.avatar{position:relative;display:inline-flex}.avatar>div{display:block;aspect-ratio:1/1;overflow:hidden}.avatar img{height:100%;width:100%;object-fit:cover}.avatar.placeholder>div{display:flex;align-items:center;justify-content:center}.badge{display:inline-flex;align-items:center;justify-content:center;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);height:1.25rem;font-size:.875rem;line-height:1.25rem;width:fit-content;padding-left:.563rem;padding-right:.563rem}.btm-nav{position:fixed;bottom:0;left:0;right:0;display:flex;width:100%;flex-direction:row;align-items:center;justify-content:space-around;padding-bottom:env(safe-area-inset-bottom)}.btm-nav>*{position:relative;display:flex;height:100%;flex-basis:100%;cursor:pointer;flex-direction:column;align-items:center;justify-content:center;gap:.25rem}.breadcrumbs{max-width:100%;overflow-x:auto}.breadcrumbs>ol,.breadcrumbs>ul{display:flex;align-items:center;white-space:nowrap;min-height:min-content}.breadcrumbs>ol>li,.breadcrumbs>ul>li{display:flex;align-items:center}.breadcrumbs>ol>li>a,.breadcrumbs>ul>li>a{display:flex;cursor:pointer;align-items:center}@media(hover:hover){.breadcrumbs>ol>li>a:hover,.breadcrumbs>ul>li>a:hover{text-decoration-line:underline}}.btn{display:inline-flex;height:3rem;min-height:3rem;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;border-radius:var(--rounded-btn,.5rem);border-color:transparent;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1em}.btn-disabled,.btn:disabled,.btn[disabled]{pointer-events:none}.btn-square{height:3rem;width:3rem;padding:0}.btn-circle{height:3rem;width:3rem;border-radius:9999px;padding:0}:where(.btn:is(input[type=checkbox])),:where(.btn:is(input[type=radio])){width:auto;appearance:none}.btn:is(input[type=checkbox]):after,.btn:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.card{position:relative;display:flex;flex-direction:column}.card:focus{outline:2px solid transparent;outline-offset:2px}.card-body{display:flex;flex:1 1 auto;flex-direction:column}.card-body :where(p){flex-grow:1}.card-actions{display:flex;flex-wrap:wrap;align-items:flex-start;gap:.5rem}.card figure{display:flex;align-items:center;justify-content:center}.card.image-full{display:grid}.card.image-full:before{position:relative;content:""}.card.image-full:before,.card.image-full>*{grid-column-start:1;grid-row-start:1}.card.image-full>figure img{height:100%;object-fit:cover}.card.image-full>.card-body{position:relative}.carousel{display:inline-flex;overflow-x:scroll;scroll-snap-type:x mandatory;scroll-behavior:smooth}.carousel-vertical{flex-direction:column;overflow-y:scroll;scroll-snap-type:y mandatory}.carousel-item{box-sizing:content-box;display:flex;flex:none;scroll-snap-align:start}.carousel-start .carousel-item{scroll-snap-align:start}.carousel-center .carousel-item{scroll-snap-align:center}.carousel-end .carousel-item{scroll-snap-align:end}.chat{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));column-gap:.75rem;padding-top:.25rem;padding-bottom:.25rem}.chat-image{grid-row:span 2/span 2;align-self:flex-end}.chat-header{grid-row-start:1;font-size:.875rem;line-height:1.25rem}.chat-footer{grid-row-start:3;font-size:.875rem;line-height:1.25rem}.chat-bubble{position:relative;display:block;width:fit-content;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;max-width:90%}.chat-bubble:before{position:absolute;bottom:0;height:.75rem;width:.75rem;background-color:inherit;content:"";mask-size:contain;mask-repeat:no-repeat;mask-position:center}.chat-start{place-items:start;grid-template-columns:auto 1fr}.chat-start .chat-header{grid-column-start:2}.chat-start .chat-footer{grid-column-start:2}.chat-start .chat-image{grid-column-start:1}.chat-start .chat-bubble{grid-column-start:2}.chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}[dir=rtl] .chat-start .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}.chat-end{place-items:end;grid-template-columns:1fr auto}.chat-end .chat-header{grid-column-start:1}.chat-end .chat-footer{grid-column-start:1}.chat-end .chat-image{grid-column-start:2}.chat-end .chat-bubble{grid-column-start:1}.chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 1 3 L 3 3 C 2 3 0 1 0 0'/%3e%3c/svg%3e")}[dir=rtl] .chat-end .chat-bubble:before{mask-image:url("data:image/svg+xml,%3csvg width='3' height='3' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m 0 3 L 3 3 L 3 0 C 3 1 1 3 0 3'/%3e%3c/svg%3e")}.checkbox{flex-shrink:0}.collapse:not(td):not(tr):not(colgroup){visibility:visible}.collapse{position:relative;display:grid;overflow:hidden;grid-template-rows:auto 0fr;transition:grid-template-rows .2s}.collapse-content,.collapse-title,.collapse>input[type=checkbox],.collapse>input[type=radio]{grid-column-start:1;grid-row-start:1}.collapse>input[type=checkbox],.collapse>input[type=radio]{appearance:none;opacity:0}.collapse-content{visibility:hidden;grid-column-start:1;grid-row-start:2;min-height:0;transition:visibility .2s}.collapse-open,.collapse:focus:not(.collapse-close),.collapse[open]{grid-template-rows:auto 1fr}.collapse:not(.collapse-close):has(>input[type=checkbox]:checked),.collapse:not(.collapse-close):has(>input[type=radio]:checked){grid-template-rows:auto 1fr}.collapse-open>.collapse-content,.collapse:focus:not(.collapse-close)>.collapse-content,.collapse:not(.collapse-close)>input[type=checkbox]:checked~.collapse-content,.collapse:not(.collapse-close)>input[type=radio]:checked~.collapse-content,.collapse[open]>.collapse-content{visibility:visible;min-height:fit-content}:root .countdown{line-height:1em}.countdown{display:inline-flex}.countdown>*{height:1em;display:inline-block;overflow-y:hidden}.countdown>:before{position:relative;content:"00\A 01\A 02\A 03\A 04\A 05\A 06\A 07\A 08\A 09\A 10\A 11\A 12\A 13\A 14\A 15\A 16\A 17\A 18\A 19\A 20\A 21\A 22\A 23\A 24\A 25\A 26\A 27\A 28\A 29\A 30\A 31\A 32\A 33\A 34\A 35\A 36\A 37\A 38\A 39\A 40\A 41\A 42\A 43\A 44\A 45\A 46\A 47\A 48\A 49\A 50\A 51\A 52\A 53\A 54\A 55\A 56\A 57\A 58\A 59\A 60\A 61\A 62\A 63\A 64\A 65\A 66\A 67\A 68\A 69\A 70\A 71\A 72\A 73\A 74\A 75\A 76\A 77\A 78\A 79\A 80\A 81\A 82\A 83\A 84\A 85\A 86\A 87\A 88\A 89\A 90\A 91\A 92\A 93\A 94\A 95\A 96\A 97\A 98\A 99\A";white-space:pre;top:calc(var(--value) * -1em)}.diff{position:relative;display:grid;width:100%;overflow:hidden;container-type:inline-size;grid-template-columns:auto 1fr}.diff-resizer{position:relative;top:50%;z-index:1;height:3rem;width:25rem;min-width:1rem;max-width:calc(100cqi - 1rem);resize:horizontal;overflow:hidden;opacity:0;transform-origin:100% 100%;scale:4;translate:1.5rem -1.5rem;clip-path:inset(calc(100% - .75rem) 0 0 calc(100% - .75rem))}.diff-item-1,.diff-item-2,.diff-resizer{position:relative;grid-column-start:1;grid-row-start:1}.diff-item-1:after{pointer-events:none;position:absolute;bottom:0;right:1px;top:50%;z-index:1;height:2rem;width:2rem;--tw-content:'';content:var(--tw-content);translate:50% -50%}.diff-item-2{overflow:hidden}.diff-item-1>*,.diff-item-2>*{pointer-events:none;position:absolute;bottom:0;left:0;top:0;height:100%;width:100cqi;max-width:none;object-fit:cover;object-position:center}.divider{display:flex;flex-direction:row;align-items:center;align-self:stretch}.divider:after,.divider:before{height:.125rem;width:100%;flex-grow:1;--tw-content:'';content:var(--tw-content)}.divider-start:before{display:none}.divider-end:after{display:none}.drawer{position:relative;display:grid;grid-auto-columns:max-content auto}.drawer-content{grid-column-start:2;grid-row-start:1;min-width:0}.drawer-side{pointer-events:none;position:fixed;inset-inline-start:0;top:0;grid-column-start:1;grid-row-start:1;display:grid;width:100%;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(1,minmax(0,1fr));align-items:flex-start;justify-items:start;overflow-x:hidden;overflow-y:hidden;overscroll-behavior:contain;height:100vh;height:100dvh}.drawer-side>.drawer-overlay{position:sticky;top:0;place-self:stretch}.drawer-side>*{grid-column-start:1;grid-row-start:1}.drawer-side>:not(.drawer-overlay){transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);will-change:transform;transform:translateX(-100%)}[dir=rtl] .drawer-side>:not(.drawer-overlay){transform:translateX(100%)}.drawer-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}.drawer-toggle:checked~.drawer-side{pointer-events:auto;visibility:visible;overflow-y:auto}.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.drawer-end{grid-auto-columns:auto max-content}.drawer-end>.drawer-toggle~.drawer-content{grid-column-start:1}.drawer-end>.drawer-toggle~.drawer-side{grid-column-start:2;justify-items:end}.drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(100%)}[dir=rtl] .drawer-end>.drawer-toggle~.drawer-side>:not(.drawer-overlay){transform:translateX(-100%)}.drawer-end>.drawer-toggle:checked~.drawer-side>:not(.drawer-overlay){transform:translateX(0)}.dropdown{position:relative;display:inline-block}.dropdown>:not(summary):focus{outline:2px solid transparent;outline-offset:2px}.dropdown .dropdown-content{position:absolute}.dropdown:is(:not(details)) .dropdown-content{visibility:hidden;opacity:0}.dropdown-end .dropdown-content{inset-inline-end:0}.dropdown-left .dropdown-content{bottom:auto;inset-inline-end:100%;top:0}.dropdown-right .dropdown-content{bottom:auto;inset-inline-start:100%;top:0}.dropdown-bottom .dropdown-content{bottom:auto;top:100%}.dropdown-top .dropdown-content{bottom:100%;top:auto}.dropdown-end.dropdown-right .dropdown-content{bottom:0;top:auto}.dropdown-end.dropdown-left .dropdown-content{bottom:0;top:auto}.dropdown.dropdown-open .dropdown-content,.dropdown:focus-within .dropdown-content,.dropdown:not(.dropdown-hover):focus .dropdown-content{visibility:visible;opacity:1}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{visibility:visible;opacity:1}}.dropdown:is(details) summary::-webkit-details-marker{display:none}.file-input{height:3rem;flex-shrink:1;padding-inline-end:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.file-input::file-selector-button{margin-inline-end:1rem;display:inline-flex;height:100%;flex-shrink:0;cursor:pointer;user-select:none;flex-wrap:wrap;align-items:center;justify-content:center;padding-left:1rem;padding-right:1rem;text-align:center;font-size:.875rem;line-height:1.25rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);line-height:1em}.footer{display:grid;width:100%;grid-auto-flow:row;place-items:start}.footer>*{display:grid;place-items:start}.footer-center{place-items:center;text-align:center}.footer-center>*{place-items:center}@media (min-width:48rem){.footer{grid-auto-flow:column}.footer-center{grid-auto-flow:row dense}}.form-control{display:flex;flex-direction:column}.label{display:flex;user-select:none;align-items:center;justify-content:space-between}.hero{display:grid;width:100%;place-items:center;background-size:cover;background-position:center}.hero>*{grid-column-start:1;grid-row-start:1}.hero-overlay{grid-column-start:1;grid-row-start:1;height:100%;width:100%}.hero-content{z-index:0;display:flex;align-items:center;justify-content:center}.indicator{position:relative;display:inline-flex;width:max-content}.indicator :where(.indicator-item){z-index:1;position:absolute;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));white-space:nowrap}.input{flex-shrink:1;appearance:none;height:3rem;padding-left:1rem;padding-right:1rem;font-size:.875rem;line-height:1.25rem;line-height:2}.input-md[type=number]::-webkit-inner-spin-button,.input[type=number]::-webkit-inner-spin-button{margin-top:-1rem;margin-bottom:-1rem;margin-inline-end:-1rem}.input-xs[type=number]::-webkit-inner-spin-button{margin-top:-.25rem;margin-bottom:-.25rem;margin-inline-end:0}.input-sm[type=number]::-webkit-inner-spin-button{margin-top:0;margin-bottom:0;margin-inline-end:0}.input-lg[type=number]::-webkit-inner-spin-button{margin-top:-1.5rem;margin-bottom:-1.5rem;margin-inline-end:-1.5rem}.join{display:inline-flex;align-items:stretch}.join :where(.join-item){border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:not(:first-child):not(:last-child),.join :not(:first-child):not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0;border-end-start-radius:0;border-start-start-radius:0}.join .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .join-item{border-start-end-radius:0;border-end-end-radius:0}.join .dropdown .join-item:first-child:not(:last-child),.join :first-child:not(:last-child) .dropdown .join-item{border-start-end-radius:inherit;border-end-end-radius:inherit}.join :where(.join-item:first-child:not(:last-child)),.join :where(:first-child:not(:last-child).join-item){border-end-start-radius:inherit;border-start-start-radius:inherit}.join .join-item:last-child:not(:first-child),.join :last-child:not(:first-child) .join-item{border-end-start-radius:0;border-start-start-radius:0}.join :where(.join-item:last-child:not(:first-child)),.join :where(:last-child:not(:first-child).join-item){border-start-end-radius:inherit;border-end-end-radius:inherit}@supports not selector(:has(*)){:where(.join*){border-radius:inherit}}@supports selector(:has(*)){:where(.join:has(.join-item)){border-radius:inherit}}.kbd{display:inline-flex;align-items:center;justify-content:center}.link{cursor:pointer;text-decoration-line:underline}.link-hover{text-decoration-line:none}@media(hover:hover){.link-hover:hover{text-decoration-line:underline}}.mask{mask-size:contain;mask-repeat:no-repeat;mask-position:center}.mask-half-1{mask-size:200%;mask-position:left}.mask-half-1:where([dir=rtl],[dir=rtl]*){mask-position:right}.mask-half-2{mask-size:200%;mask-position:right}.mask-half-2:where([dir=rtl],[dir=rtl]*){mask-position:left}.menu{display:flex;flex-direction:column;flex-wrap:wrap;font-size:.875rem;line-height:1.25rem}.menu :where(liul){position:relative;white-space:nowrap}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){display:grid;grid-auto-flow:column;align-content:flex-start;align-items:center;gap:.5rem;grid-auto-columns:minmax(auto,max-content) auto max-content;user-select:none}.menu li.disabled{cursor:not-allowed;user-select:none}.menu :where(li>.menu-dropdown:not(.menu-dropdown-show)){display:none}:where(.menuli){position:relative;display:flex;flex-shrink:0;flex-direction:column;flex-wrap:wrap;align-items:stretch}:where(.menuli) .badge{justify-self:end}.mockup-code{position:relative;overflow:hidden;overflow-x:auto}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-window{position:relative;overflow:hidden;overflow-x:auto}.mockup-window pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.mockup-browser{position:relative;overflow:hidden;overflow-x:auto}.mockup-browser pre[data-prefix]:before{content:attr(data-prefix);display:inline-block;text-align:right}.modal{pointer-events:none;position:fixed;inset:0;margin:0;display:grid;height:100%;max-height:none;width:100%;max-width:none;justify-items:center;padding:0;opacity:0;overscroll-behavior:contain;z-index:999}.modal-scroll{overscroll-behavior:auto}:where(.modal){align-items:center}.modal-box{max-height:calc(100vh - 5em)}.modal-open,.modal-toggle:checked+.modal,.modal:target,.modal[open]{pointer-events:auto;visibility:visible;opacity:1}.modal-action{display:flex}.modal-toggle{position:fixed;height:0;width:0;appearance:none;opacity:0}:root:has(:is(.modal-open,.modal:target,.modal-toggle:checked+.modal,.modal[open])){overflow:hidden;scrollbar-gutter:stable}.navbar{display:flex;align-items:center}:where(.navbar>:not(script,style)){display:inline-flex;align-items:center}.navbar-start{width:50%;justify-content:flex-start}.navbar-center{flex-shrink:0}.navbar-end{width:50%;justify-content:flex-end}.progress{position:relative;width:100%;appearance:none;overflow:hidden}.radial-progress{position:relative;display:inline-grid;height:var(--size);width:var(--size);place-content:center;border-radius:9999px;background-color:transparent;vertical-align:middle;box-sizing:content-box}.radial-progress::-moz-progress-bar{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-value{appearance:none;background-color:transparent}.radial-progress::-webkit-progress-bar{appearance:none;background-color:transparent}.radial-progress:after,.radial-progress:before{position:absolute;border-radius:9999px;content:""}.radial-progress:before{inset:0;background:radial-gradient(farthest-side,currentColor 98%,#0000) top/var(--thickness) var(--thickness) no-repeat,conic-gradient(currentColor calc(var(--value) * 1%),#0000 0);-webkit-mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)));mask:radial-gradient(farthest-side,#0000 calc(99% - var(--thickness)),#000 calc(100% - var(--thickness)))}.radial-progress:after{inset:calc(50% - var(--thickness)/ 2);transform:rotate(calc(var(--value) * 3.6deg - 90deg)) translate(calc(var(--size)/ 2 - 50%))}.radio{flex-shrink:0}.range{height:1.5rem;width:100%;cursor:pointer}.range:focus{outline:0}.rating{position:relative;display:inline-flex}.rating :where(input){cursor:pointer;border-radius:0}.select{display:inline-flex;cursor:pointer;user-select:none;appearance:none;height:3rem;min-height:3rem;padding-inline-start:1rem;padding-inline-end:2.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.select[multiple]{height:auto}.stack{display:inline-grid}.stack>*{grid-column-start:1;grid-row-start:1;transform:translateY(10%) scale(.9);z-index:1}.stack>:nth-child(2){transform:translateY(5%) scale(.95);z-index:2}.stack>:nth-child(1){transform:translateY(0) scale(1);z-index:3}.stats{display:inline-grid}:where(.stats){grid-auto-flow:column}.stat{display:inline-grid;width:100%;grid-template-columns:repeat(1,1fr)}.stat-figure{grid-column-start:2;grid-row:span 3/span 3;grid-row-start:1;place-self:center;justify-self:end}.stat-title{grid-column-start:1;white-space:nowrap}.stat-value{grid-column-start:1;white-space:nowrap}.stat-desc{grid-column-start:1;white-space:nowrap}.stat-actions{grid-column-start:1;white-space:nowrap}.steps{display:inline-grid;grid-auto-flow:column;overflow:hidden;overflow-x:auto;counter-reset:step;grid-auto-columns:1fr}.steps .step{display:grid;grid-template-columns:repeat(1,minmax(0,1fr));grid-template-rows:repeat(2,minmax(0,1fr));place-items:center;text-align:center}.swap{position:relative;display:inline-grid;user-select:none;place-content:center}.swap>*{grid-column-start:1;grid-row-start:1}.swap input{appearance:none}.swap .swap-indeterminate,.swap .swap-on,.swap input:indeterminate~.swap-on{opacity:0}.swap input:checked~.swap-off,.swap input:indeterminate~.swap-off,.swap-active .swap-off{opacity:0}.swap input:checked~.swap-on,.swap input:indeterminate~.swap-indeterminate,.swap-active .swap-on{opacity:1}.tabs{display:grid;align-items:flex-end}.tabs-lifted:has(.tab-content[class*=" rounded-"]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])),.tabs-lifted:has(.tab-content[class^=rounded-]) .tab:first-child:not(:is(.tab-active,[aria-selected=true])){border-bottom-color:transparent}.tab{position:relative;grid-row-start:1;display:inline-flex;height:2rem;cursor:pointer;user-select:none;appearance:none;flex-wrap:wrap;align-items:center;justify-content:center;text-align:center;font-size:.875rem;line-height:1.25rem;line-height:2;--tab-padding:1rem}.tab:is(input[type=radio]){width:auto;border-bottom-right-radius:0;border-bottom-left-radius:0}.tab:is(input[type=radio]):after{--tw-content:attr(aria-label);content:var(--tw-content)}.tab:not(input):empty{cursor:default;grid-column-start:span 9999}.tab-content{grid-column-start:1;grid-column-end:span 9999;grid-row-start:2;margin-top:calc(var(--tab-border) * -1);display:none;border-color:transparent;border-width:var(--tab-border,0)}:checked+.tab-content:nth-child(2),:is(.tab-active,[aria-selected=true])+.tab-content:nth-child(2){border-start-start-radius:0}:is(.tab-active,[aria-selected=true])+.tab-content,input.tab:checked+.tab-content{display:block}.table{position:relative;width:100%}.table :where(.table-pin-rowstheadtr){position:sticky;top:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-rowstfoottr){position:sticky;bottom:0;z-index:1;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table :where(.table-pin-colstrth){position:sticky;left:0;right:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.table-zebra tbody tr:nth-child(even) :where(.table-pin-colstrth){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}.textarea{min-height:3rem;flex-shrink:1;padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;line-height:2}.timeline{position:relative;display:flex}:where(.timeline>li){position:relative;display:grid;flex-shrink:0;align-items:center;grid-template-rows:var(--timeline-row-start,minmax(0,1fr)) auto var(--timeline-row-end,minmax(0,1fr));grid-template-columns:var(--timeline-col-start,minmax(0,1fr)) auto var(--timeline-col-end,minmax(0,1fr))}.timeline>li>hr{width:100%;border-width:0}:where(.timeline>li>hr):first-child{grid-column-start:1;grid-row-start:2}:where(.timeline>li>hr):last-child{grid-column-start:3;grid-column-end:none;grid-row-start:2;grid-row-end:auto}.timeline-start{grid-column-start:1;grid-column-end:4;grid-row-start:1;grid-row-end:2;margin:.25rem;align-self:flex-end;justify-self:center}.timeline-middle{grid-column-start:2;grid-row-start:2}.timeline-end{grid-column-start:1;grid-column-end:4;grid-row-start:3;grid-row-end:4;margin:.25rem;align-self:flex-start;justify-self:center}.toast{position:fixed;display:flex;min-width:fit-content;flex-direction:column;white-space:nowrap}.toggle{flex-shrink:0}.alert{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));padding:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-b2,oklch(var(--b2)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1));background-color:var(--alert-bg)}.alert-info{border-color:var(--fallback-in,oklch(var(--in)/.2));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-in,oklch(var(--in)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-success{border-color:var(--fallback-su,oklch(var(--su)/.2));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-su,oklch(var(--su)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-warning{border-color:var(--fallback-wa,oklch(var(--wa)/.2));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));--alert-bg:var(--fallback-wa,oklch(var(--wa)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.alert-error{border-color:var(--fallback-er,oklch(var(--er)/.2));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));--alert-bg:var(--fallback-er,oklch(var(--er)/1));--alert-bg-mix:var(--fallback-b1,oklch(var(--b1)/1))}.avatar-group{display:flex;overflow:hidden}.avatar-group :where(.avatar){overflow:hidden;border-radius:9999px;border-width:4px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.badge{border-radius:var(--rounded-badge,1.9rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-neutral{--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.badge-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.badge-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.badge-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.badge-info{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.badge-success{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.badge-warning{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.badge-error{border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.badge-ghost{--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.badge-outline{border-color:currentColor;--tw-border-opacity:0.5;background-color:transparent;color:currentColor}.badge-outline.badge-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}.badge-outline.badge-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}.badge-outline.badge-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}.badge-outline.badge-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}.badge-outline.badge-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}.badge-outline.badge-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}.badge-outline.badge-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}.badge-outline.badge-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}.btm-nav{height:4rem;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));color:currentColor}.btm-nav>*{border-color:currentColor}.btm-nav>:not(.active){padding-top:.125rem}.btm-nav>:where(.active){border-top-width:2px;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.btm-nav>.disabled,.btm-nav>[disabled]{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btm-nav>.disabled:hover,.btm-nav>[disabled]:hover{pointer-events:none;--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btm-nav>* .label{font-size:1rem;line-height:1.5rem}.breadcrumbs{padding-top:.5rem;padding-bottom:.5rem}.breadcrumbs>ol>li>a:focus,.breadcrumbs>ul>li>a:focus{outline:2px solid transparent;outline-offset:2px}.breadcrumbs>ol>li>a:focus-visible,.breadcrumbs>ul>li>a:focus-visible{outline:2px solid currentColor;outline-offset:2px}.breadcrumbs>ol>li+:before,.breadcrumbs>ul>li+:before{content:"";margin-left:.5rem;margin-right:.75rem;display:block;height:.375rem;width:.375rem;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));opacity:.4;border-top:1px solid;border-right:1px solid;background-color:transparent}[dir=rtl] .breadcrumbs>ol>li+:before,[dir=rtl] .breadcrumbs>ul>li+:before{--tw-rotate:-135deg}.btn{gap:.5rem;font-weight:600;text-decoration-line:none;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);border-width:var(--border-btn,1px);transition-property:color,background-color,border-color,opacity,box-shadow,transform}@media (prefers-reduced-motion:no-preference){.btn{animation:button-pop var(--animation-btn,.25s) ease-out}}.btn:active:focus,.btn:active:hover{animation:button-pop 0s ease-out;transform:scale(var(--btn-focus-scale,.97))}.btn{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));text-decoration-line:none;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:var(--fallback-bc,oklch(var(--bc)/1));background-color:oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity));--tw-bg-opacity:1;border-color:oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity));--tw-border-opacity:1}@supports not (color:oklch(0% 0 0)){.btn{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}@media (hover:hover){.btn:hover{--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn:hover{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b2)) / var(--tw-border-opacity,1)) 90%,#000)}}@supports not (color:oklch(0% 0 0)){.btn:hover{background-color:var(--btn-color,var(--fallback-b2));border-color:var(--btn-color,var(--fallback-b2))}}}@supports (color:color-mix(in oklab,black,black)){.btn-active{background-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-bg-opacity,1)) 90%,#000);border-color:color-mix(in oklab,oklch(var(--btn-color,var(--b3)) / var(--tw-border-opacity,1)) 90%,#000)}}.btn:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px}.btn-primary{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}@supports (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--p)}}@supports not (color:oklch(0% 0 0)){.btn-primary{--btn-color:var(--fallback-p)}}.btn-secondary{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}@supports (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--s)}}@supports not (color:oklch(0% 0 0)){.btn-secondary{--btn-color:var(--fallback-s)}}.btn-accent{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}@supports (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--a)}}@supports not (color:oklch(0% 0 0)){.btn-accent{--btn-color:var(--fallback-a)}}.btn-neutral{--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));outline-color:var(--fallback-n,oklch(var(--n)/1))}@supports (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--n)}}@supports not (color:oklch(0% 0 0)){.btn-neutral{--btn-color:var(--fallback-n)}}.btn-info{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}@supports (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--in)}}@supports not (color:oklch(0% 0 0)){.btn-info{--btn-color:var(--fallback-in)}}.btn-success{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}@supports (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--su)}}@supports not (color:oklch(0% 0 0)){.btn-success{--btn-color:var(--fallback-su)}}.btn-warning{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}@supports (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--wa)}}@supports not (color:oklch(0% 0 0)){.btn-warning{--btn-color:var(--fallback-wa)}}.btn-error{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}@supports (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--er)}}@supports not (color:oklch(0% 0 0)){.btn-error{--btn-color:var(--fallback-er)}}.btn.glass{--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn.glass:hover{--glass-opacity:25%;--glass-border-opacity:15%}}.btn.glass.btn-active{--glass-opacity:25%;--glass-border-opacity:15%}.btn-ghost{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-ghost:hover{border-color:transparent}@supports (color:oklch(0% 0 0)){.btn-ghost:hover{background-color:var(--fallback-bc,oklch(var(--bc)/.2))}}}.btn-ghost.btn-active{border-color:transparent;background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.btn-link{border-color:transparent;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)));text-decoration-line:underline;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-color:currentColor}@media (hover:hover){.btn-link:hover{border-color:transparent;background-color:transparent;text-decoration-line:underline}}.btn-link.btn-active{border-color:transparent;background-color:transparent;text-decoration-line:underline}.btn-outline{border-color:currentColor;background-color:transparent;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}@media (hover:hover){.btn-outline:hover{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}}.btn-outline.btn-active{--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-b1,oklch(var(--b1)/var(--tw-text-opacity)))}.btn-outline.btn-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-primary:hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary:hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn-outline.btn-primary.btn-active{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-primary.btn-active{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}.btn-outline.btn-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-secondary:hover{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary:hover{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}}.btn-outline.btn-secondary.btn-active{--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-secondary.btn-active{background-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 90%,#000)}}.btn-outline.btn-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-accent:hover{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent:hover{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}}.btn-outline.btn-accent.btn-active{--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-accent.btn-active{background-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 90%,#000)}}.btn-outline.btn-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-success:hover{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success:hover{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}}.btn-outline.btn-success.btn-active{--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-success.btn-active{background-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 90%,#000)}}.btn-outline.btn-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-info:hover{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info:hover{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}}.btn-outline.btn-info.btn-active{--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-info.btn-active{background-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 90%,#000)}}.btn-outline.btn-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-warning:hover{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning:hover{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}}.btn-outline.btn-warning.btn-active{--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-warning.btn-active{background-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 90%,#000)}}.btn-outline.btn-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@media (hover:hover){.btn-outline.btn-error:hover{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error:hover{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}}.btn-outline.btn-error.btn-active{--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}@supports (color:color-mix(in oklab,black,black)){.btn-outline.btn-error.btn-active{background-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 90%,#000)}}.btn.btn-disabled,.btn:disabled,.btn[disabled]{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.btn-disabled:hover,.btn:disabled:hover,.btn[disabled]:hover{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.btn:is(input[type=checkbox]:checked),.btn:is(input[type=radio]:checked){--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media (hover:hover){@supports (color:color-mix(in oklab,black,black)){.btn:is(input[type=checkbox]:checked):hover,.btn:is(input[type=radio]:checked):hover{background-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000);border-color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 90%,#000)}}}.btn:is(input[type=checkbox]:checked):focus-visible,.btn:is(input[type=radio]:checked):focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}@keyframes button-pop{0%{transform:scale(var(--btn-focus-scale,.98))}40%{transform:scale(1.02)}100%{transform:scale(1)}}.card{border-radius:var(--rounded-box,1rem)}.card :where(figure:first-child){overflow:hidden;border-start-start-radius:inherit;border-start-end-radius:inherit;border-end-start-radius:unset;border-end-end-radius:unset}.card :where(figure:last-child){overflow:hidden;border-start-start-radius:unset;border-start-end-radius:unset;border-end-start-radius:inherit;border-end-end-radius:inherit}.card:focus-visible{outline:2px solid currentColor;outline-offset:2px}.card.bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card-bordered{border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.card.compact .card-body{padding:1rem;font-size:.875rem;line-height:1.25rem}.card-body{padding:var(--padding-card,2rem);display:flex;flex-direction:column;gap:.5rem}.card-title{display:flex;align-items:center;gap:.5rem;font-size:1.25rem;line-height:1.75rem;font-weight:600}.card.image-full:before{z-index:10;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));opacity:.75}.card.image-full>.card-body{z-index:20;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.card.image-full :where(figure){overflow:hidden;border-radius:inherit}.carousel{-ms-overflow-style:none;scrollbar-width:none}.carousel::-webkit-scrollbar{display:none}.chat-bubble{border-radius:var(--rounded-box,1rem);min-height:2.75rem;min-width:2.75rem}.chat-bubble{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.chat-bubble-primary{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.chat-bubble-secondary{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.chat-bubble-accent{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.chat-bubble-info{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.chat-bubble-success{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.chat-bubble-warning{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.chat-bubble-error{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.chat-start .chat-bubble{border-end-start-radius:0}.chat-start .chat-bubble:before{inset-inline-start:-.749rem}.chat-end .chat-bubble{border-end-end-radius:0}.chat-end .chat-bubble:before{inset-inline-start:99.9%}.checkbox{--chkbg:var(--fallback-bc,oklch(var(--bc)/1));--chkfg:var(--fallback-b1,oklch(var(--b1)/1));height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.checkbox:focus{box-shadow:none}.checkbox:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.checkbox:disabled{border-width:0}.checkbox:checked,.checkbox[aria-checked=true]{background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-color:var(--chkbg);background-image:linear-gradient(-45deg,transparent 65%,var(--chkbg) 65.99%),linear-gradient(45deg,transparent 75%,var(--chkbg) 75.99%),linear-gradient(-45deg,var(--chkbg) 40%,transparent 40.99%),linear-gradient(45deg,var(--chkbg) 30%,var(--chkfg) 30.99%,var(--chkfg) 40%,transparent 40.99%),linear-gradient(-45deg,var(--chkfg) 50%,var(--chkbg) 50.99%)}.checkbox:indeterminate{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-repeat:no-repeat;animation:checkmark var(--animation-input,.2s) ease-out;background-image:linear-gradient(90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(-90deg,transparent 80%,var(--chkbg) 80%),linear-gradient(0deg,var(--chkbg) 43%,var(--chkfg) 43%,var(--chkfg) 57%,var(--chkbg) 57%)}.checkbox-primary{--chkbg:var(--fallback-p,oklch(var(--p)/1));--chkfg:var(--fallback-pc,oklch(var(--pc)/1));--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.checkbox-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.checkbox-primary:checked,.checkbox-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.checkbox-secondary{--chkbg:var(--fallback-s,oklch(var(--s)/1));--chkfg:var(--fallback-sc,oklch(var(--sc)/1));--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.checkbox-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.checkbox-secondary:checked,.checkbox-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.checkbox-accent{--chkbg:var(--fallback-a,oklch(var(--a)/1));--chkfg:var(--fallback-ac,oklch(var(--ac)/1));--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.checkbox-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.checkbox-accent:checked,.checkbox-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.checkbox-success{--chkbg:var(--fallback-su,oklch(var(--su)/1));--chkfg:var(--fallback-suc,oklch(var(--suc)/1));--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.checkbox-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.checkbox-success:checked,.checkbox-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.checkbox-warning{--chkbg:var(--fallback-wa,oklch(var(--wa)/1));--chkfg:var(--fallback-wac,oklch(var(--wac)/1));--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.checkbox-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.checkbox-warning:checked,.checkbox-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.checkbox-info{--chkbg:var(--fallback-in,oklch(var(--in)/1));--chkfg:var(--fallback-inc,oklch(var(--inc)/1));--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.checkbox-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.checkbox-info:checked,.checkbox-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.checkbox-error{--chkbg:var(--fallback-er,oklch(var(--er)/1));--chkfg:var(--fallback-erc,oklch(var(--erc)/1));--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.checkbox-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.checkbox-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.checkbox-error:checked,.checkbox-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.checkbox:disabled{cursor:not-allowed;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.2}@keyframes checkmark{0%{background-position-y:5px}50%{background-position-y:-2px}100%{background-position-y:0}}.checkbox-mark{display:none}.collapse{width:100%;border-radius:var(--rounded-box,1rem)}details.collapse{width:100%}details.collapse summary{position:relative;display:block}details.collapse summary::-webkit-details-marker{display:none}.collapse:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}details.collapse summary{outline:2px solid transparent;outline-offset:2px}.collapse:has(.collapse-title:focus-visible),.collapse:has(>input[type=checkbox]:focus-visible),.collapse:has(>input[type=radio]:focus-visible){outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.collapse-arrow>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;--tw-translate-y:-100%;--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:150ms;transition-timing-function:cubic-bezier(0,0,.2,1);transition-duration:.2s;top:1.9rem;inset-inline-end:1.4rem;content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.collapse-plus>.collapse-title:after{position:absolute;display:block;height:.5rem;width:.5rem;transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);top:.9rem;inset-inline-end:1.4rem;content:"+";pointer-events:none}.collapse:not(.collapse-open):not(.collapse-close)>.collapse-title,.collapse:not(.collapse-open):not(.collapse-close)>input[type=checkbox],.collapse:not(.collapse-open):not(.collapse-close)>input[type=radio]:not(:checked){cursor:pointer}.collapse:focus:not(.collapse-open):not(.collapse-close):not(.collapse[open])>.collapse-title{cursor:unset}.collapse-title{position:relative}:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){z-index:1}.collapse-title,:where(.collapse>input[type=checkbox]),:where(.collapse>input[type=radio]){width:100%;padding:1rem;padding-inline-end:3rem;min-height:3.75rem;transition:background-color .2s ease-out}.collapse-content{padding-left:1rem;padding-right:1rem;cursor:unset;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-open>:where(.collapse-content),.collapse:focus:not(.collapse-close)>:where(.collapse-content),.collapse:not(.collapse-close)>:where(input[type=checkbox]:checked~.collapse-content),.collapse:not(.collapse-close)>:where(input[type=radio]:checked~.collapse-content),.collapse[open]>:where(.collapse-content){padding-bottom:1rem;transition:padding .2s ease-out,background-color .2s ease-out}.collapse-arrow:focus:not(.collapse-close)>.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-arrow:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse-open.collapse-arrow>.collapse-title:after,.collapse[open].collapse-arrow>.collapse-title:after{--tw-translate-y:-50%;--tw-rotate:225deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.collapse-open.collapse-plus>.collapse-title:after,.collapse-plus:focus:not(.collapse-close)>.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=checkbox]:checked~.collapse-title:after,.collapse-plus:not(.collapse-close)>input[type=radio]:checked~.collapse-title:after,.collapse[open].collapse-plus>.collapse-title:after{content:"−"}.countdown>:before{text-align:center;transition:all 1s cubic-bezier(1,0,0,1)}.diff-item-1:after{border-radius:9999px;border-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)));background-color:var(--fallback-b1,oklch(var(--b1)/.5));--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);outline-style:solid;outline-offset:-3px;outline-color:var(--fallback-bc,oklch(var(--bc)/.05));--tw-backdrop-blur:blur(8px);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);translate:50% -50%}.diff-item-2{border-right-width:2px;--tw-border-opacity:1;border-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-border-opacity)))}.divider{margin-top:1rem;margin-bottom:1rem;height:1rem;white-space:nowrap}.divider:after,.divider:before{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.divider:not(:empty){gap:1rem}.divider-neutral:after,.divider-neutral:before{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)))}.divider-primary:after,.divider-primary:before{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.divider-secondary:after,.divider-secondary:before{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.divider-accent:after,.divider-accent:before{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.divider-success:after,.divider-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.divider-warning:after,.divider-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.divider-info:after,.divider-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.divider-error:after,.divider-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.drawer{width:100%}.drawer-side>.drawer-overlay{cursor:pointer;background-color:transparent;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.drawer-toggle:checked~.drawer-side>.drawer-overlay{background-color:#0006}.drawer-toggle:focus-visible~.drawer-content label.drawer-button{outline-style:solid;outline-width:2px;outline-offset:2px}.dropdown:is(:not(details)) .dropdown-content{transform-origin:top;--tw-scale-x:.95;--tw-scale-y:.95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1)}.dropdown-bottom .dropdown-content{transform-origin:top}.dropdown-top .dropdown-content{transform-origin:bottom}.dropdown-left .dropdown-content{transform-origin:right}.dropdown-right .dropdown-content{transform-origin:left}.dropdown.dropdown-open .dropdown-content,.dropdown:focus .dropdown-content,.dropdown:focus-within .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@media (hover:hover){.dropdown.dropdown-hover:hover .dropdown-content{--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}}.file-input{overflow:hidden;border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.file-input::file-selector-button{border-style:solid;--tw-border-opacity:1;border-color:var(--fallback-n,oklch(var(--n)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));font-weight:600;text-transform:uppercase;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));text-decoration-line:none;border-width:var(--border-btn,1px);animation:button-pop var(--animation-btn,.25s) ease-out}.file-input-bordered{--tw-border-opacity:0.2}.file-input:focus{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.file-input-ghost{--tw-bg-opacity:0.05}.file-input-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.file-input-ghost::file-selector-button{border-width:1px;border-color:transparent;background-color:transparent;color:currentColor}.file-input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.file-input-primary:focus{outline-color:var(--fallback-p,oklch(var(--p)/1))}.file-input-primary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.file-input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.file-input-secondary:focus{outline-color:var(--fallback-s,oklch(var(--s)/1))}.file-input-secondary::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.file-input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.file-input-accent:focus{outline-color:var(--fallback-a,oklch(var(--a)/1))}.file-input-accent::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.file-input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.file-input-info:focus{outline-color:var(--fallback-in,oklch(var(--in)/1))}.file-input-info::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.file-input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.file-input-success:focus{outline-color:var(--fallback-su,oklch(var(--su)/1))}.file-input-success::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.file-input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.file-input-warning:focus{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.file-input-warning::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.file-input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.file-input-error:focus{outline-color:var(--fallback-er,oklch(var(--er)/1))}.file-input-error::file-selector-button{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.file-input-disabled,.file-input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));--tw-text-opacity:0.2}.file-input-disabled::placeholder,.file-input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.file-input-disabled::file-selector-button,.file-input[disabled]::file-selector-button{--tw-border-opacity:0;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.2;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}.footer{column-gap:1rem;row-gap:2.5rem;font-size:.875rem;line-height:1.25rem}.footer>*{gap:.5rem}.footer-title{margin-bottom:.5rem;font-weight:700;text-transform:uppercase;opacity:.6}.label{padding-left:.25rem;padding-right:.25rem;padding-top:.5rem;padding-bottom:.5rem}.label-text{font-size:.875rem;line-height:1.25rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.label-text-alt{font-size:.75rem;line-height:1rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}@media(hover:hover){.label a:hover{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}}.hero-overlay{background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-bg-opacity:0.5}.hero-content{max-width:80rem;gap:1rem;padding:1rem}.input{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));font-size:1rem;line-height:1.5rem}.input input{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));background-color:transparent}.input input:focus{outline:2px solid transparent;outline-offset:2px}.input[list]::-webkit-calendar-picker-indicator{line-height:1em}.input-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input:focus,.input:focus-within{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.input-ghost{--tw-bg-opacity:0.05}.input-ghost:focus,.input-ghost:focus-within{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.input-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.input-primary:focus,.input-primary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.input-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.input-secondary:focus,.input-secondary:focus-within{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.input-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.input-accent:focus,.input-accent:focus-within{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.input-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.input-info:focus,.input-info:focus-within{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.input-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.input-success:focus,.input-success:focus-within{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.input-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.input-warning:focus,.input-warning:focus-within{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.input-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.input-error:focus,.input-error:focus-within{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.input-disabled,.input:disabled,.input:has(>input[disabled]),.input[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.input-disabled::placeholder,.input:disabled::placeholder,.input:has(>input[disabled])::placeholder,.input[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.input:has(>input[disabled])>input[disabled]{cursor:not-allowed}.input::-webkit-date-and-time-value{text-align:inherit}.join{border-radius:var(--rounded-btn,.5rem)}.join>:where(:not(:first-child)){margin-top:0;margin-bottom:0;margin-inline-start:-1px}.join>:where(:not(:first-child)):is(.btn){margin-inline-start:calc(var(--border-btn) * -1)}.join-item:focus{isolation:isolate}.kbd{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:.5rem;padding-right:.5rem;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));border-bottom-width:2px;min-height:2.2em;min-width:2.2em}.link-primary{--tw-text-opacity:1;color:var(--fallback-p,oklch(var(--p)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-primary:hover{color:color-mix(in oklab,var(--fallback-p,oklch(var(--p)/1)) 80%,#000)}}}.link-secondary{--tw-text-opacity:1;color:var(--fallback-s,oklch(var(--s)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-secondary:hover{color:color-mix(in oklab,var(--fallback-s,oklch(var(--s)/1)) 80%,#000)}}}.link-accent{--tw-text-opacity:1;color:var(--fallback-a,oklch(var(--a)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-accent:hover{color:color-mix(in oklab,var(--fallback-a,oklch(var(--a)/1)) 80%,#000)}}}.link-neutral{--tw-text-opacity:1;color:var(--fallback-n,oklch(var(--n)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-neutral:hover{color:color-mix(in oklab,var(--fallback-n,oklch(var(--n)/1)) 80%,#000)}}}.link-success{--tw-text-opacity:1;color:var(--fallback-su,oklch(var(--su)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-success:hover{color:color-mix(in oklab,var(--fallback-su,oklch(var(--su)/1)) 80%,#000)}}}.link-info{--tw-text-opacity:1;color:var(--fallback-in,oklch(var(--in)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-info:hover{color:color-mix(in oklab,var(--fallback-in,oklch(var(--in)/1)) 80%,#000)}}}.link-warning{--tw-text-opacity:1;color:var(--fallback-wa,oklch(var(--wa)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-warning:hover{color:color-mix(in oklab,var(--fallback-wa,oklch(var(--wa)/1)) 80%,#000)}}}.link-error{--tw-text-opacity:1;color:var(--fallback-er,oklch(var(--er)/var(--tw-text-opacity)))}@supports(color:color-mix(in oklab,black,black)){@media(hover:hover){.link-error:hover{color:color-mix(in oklab,var(--fallback-er,oklch(var(--er)/1)) 80%,#000)}}}.link:focus{outline:2px solid transparent;outline-offset:2px}.link:focus-visible{outline:2px solid currentColor;outline-offset:2px}.loading{pointer-events:none;display:inline-block;aspect-ratio:1/1;width:1.5rem;background-color:currentColor;mask-size:100%;mask-repeat:no-repeat;mask-position:center;mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-spinner{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' stroke='%23000' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_V8m1%7Btransform-origin:center;animation:spinner_zKoa 2s linear infinite%7D.spinner_V8m1 circle%7Bstroke-linecap:round;animation:spinner_YpZS 1.5s ease-out infinite%7D%40keyframes spinner_zKoa%7B100%25%7Btransform:rotate(360deg)%7D%7D%40keyframes spinner_YpZS%7B0%25%7Bstroke-dasharray:0 150;stroke-dashoffset:0%7D47.5%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-16%7D95%25%2C100%25%7Bstroke-dasharray:42 150;stroke-dashoffset:-59%7D%7D%3C%2Fstyle%3E%3Cg class='spinner_V8m1'%3E%3Ccircle cx='12' cy='12' r='9.5' fill='none' stroke-width='3'%3E%3C%2Fcircle%3E%3C%2Fg%3E%3C%2Fsvg%3E")}.loading-dots{mask-image:url("data:image/svg+xml,%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_qM83%7Banimation:spinner_8HQG 1.05s infinite%7D.spinner_oXPr%7Banimation-delay:.1s%7D.spinner_ZTLf%7Banimation-delay:.2s%7D@keyframes spinner_8HQG%7B0%25,57.14%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);transform:translate(0)%7D28.57%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);transform:translateY(-6px)%7D100%25%7Btransform:translate(0)%7D%7D%3C/style%3E%3Ccircle class='spinner_qM83' cx='4' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_oXPr' cx='12' cy='12' r='3'/%3E%3Ccircle class='spinner_qM83 spinner_ZTLf' cx='20' cy='12' r='3'/%3E%3C/svg%3E")}.loading-ring{mask-image:url("data:image/svg+xml,%3Csvg width='44' height='44' viewBox='0 0 44 44' xmlns='http://www.w3.org/2000/svg' stroke='%23fff'%3E%3Cg fill='none' fill-rule='evenodd' stroke-width='2'%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='0s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='0s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3Ccircle cx='22' cy='22' r='1'%3E%3Canimate attributeName='r' begin='-0.9s' dur='1.8s' values='1; 20' calcMode='spline' keyTimes='0; 1' keySplines='0.165, 0.84, 0.44, 1' repeatCount='indefinite' /%3E%3Canimate attributeName='stroke-opacity' begin='-0.9s' dur='1.8s' values='1; 0' calcMode='spline' keyTimes='0; 1' keySplines='0.3, 0.61, 0.355, 1' repeatCount='indefinite' /%3E%3C/circle%3E%3C/g%3E%3C/svg%3E")}.loading-ball{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_rXNP%7Banimation:spinner_YeBj .8s infinite%7D@keyframes spinner_YeBj%7B0%25%7Banimation-timing-function:cubic-bezier(0.33,0,.66,.33);cy:5px%7D46.875%25%7Bcy:20px;rx:4px;ry:4px%7D50%25%7Banimation-timing-function:cubic-bezier(0.33,.66,.66,1);cy:20.5px;rx:4.8px;ry:3px%7D53.125%25%7Brx:4px;ry:4px%7D100%25%7Bcy:5px%7D%7D%3C/style%3E%3Cellipse class='spinner_rXNP' cx='12' cy='5' rx='4' ry='4'/%3E%3C/svg%3E")}.loading-bars{mask-image:url("data:image/svg+xml,%0A%3Csvg width='24' height='24' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cstyle%3E.spinner_hzlK%7Banimation:spinner_vc4H .8s linear infinite;animation-delay:-.8s%7D.spinner_koGT%7Banimation-delay:-.65s%7D.spinner_YF1u%7Banimation-delay:-.5s%7D@keyframes spinner_vc4H%7B0%25%7By:1px;height:22px%7D93.75%25%7By:5px;height:14px;opacity:.2%7D%7D%3C/style%3E%3Crect class='spinner_hzlK' x='1' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_koGT' x='9' y='1' width='6' height='22'/%3E%3Crect class='spinner_hzlK spinner_YF1u' x='17' y='1' width='6' height='22'/%3E%3C/svg%3E")}.loading-infinity{mask-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' style='shape-rendering: auto;' width='200px' height='200px' viewBox='0 0 100 100' preserveAspectRatio='xMidYMid'%3E%3Cpath fill='none' stroke='%230a0a0a' stroke-width='10' stroke-dasharray='205.271142578125 51.317785644531256' d='M24.3 30C11.4 30 5 43.3 5 50s6.4 20 19.3 20c19.3 0 32.1-40 51.4-40 C88.6 30 95 43.3 95 50s-6.4 20-19.3 20C56.4 70 43.6 30 24.3 30z' stroke-linecap='round' style='transform:scale(0.8);transform-origin:50px 50px'%3E%3Canimate attributeName='stroke-dashoffset' repeatCount='indefinite' dur='2s' keyTimes='0;1' values='0;256.58892822265625'%3E%3C/animate%3E%3C/path%3E%3C/svg%3E")}.loading-xs{width:1rem}.loading-sm{width:1.25rem}.loading-md{width:1.5rem}.loading-lg{width:2.5rem}.mask-squircle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 0C20 0 0 20 0 100s20 100 100 100 100-20 100-100S180 0 100 0Z'/%3e%3c/svg%3e")}.mask-decagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 58.779 19.098 36.327 50v61.804l-36.327 50L96 200l-58.779-19.098-36.327-50V69.098l36.327-50z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-diamond{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m100 0 100 100-100 100L0 100z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-heart{mask-image:url("data:image/svg+xml,%3csvg width='200' height='185' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M100 184.606a15.384 15.384 0 0 1-8.653-2.678C53.565 156.28 37.205 138.695 28.182 127.7 8.952 104.264-.254 80.202.005 54.146.308 24.287 24.264 0 53.406 0c21.192 0 35.869 11.937 44.416 21.879a2.884 2.884 0 0 0 4.356 0C110.725 11.927 125.402 0 146.594 0c29.142 0 53.098 24.287 53.4 54.151.26 26.061-8.956 50.122-28.176 73.554-9.023 10.994-25.383 28.58-63.165 54.228a15.384 15.384 0 0 1-8.653 2.673Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon{mask-image:url("data:image/svg+xml,%3csvg width='182' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M.3 65.486c0-9.196 6.687-20.063 14.211-25.078l61.86-35.946c8.36-5.016 20.899-5.016 29.258 0l61.86 35.946c8.36 5.015 14.211 15.882 14.211 25.078v71.055c0 9.196-6.687 20.063-14.211 25.079l-61.86 35.945c-8.36 4.18-20.899 4.18-29.258 0L14.51 161.62C6.151 157.44.3 145.737.3 136.54V65.486Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-hexagon-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='182' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M64.786 181.4c-9.196 0-20.063-6.687-25.079-14.21L3.762 105.33c-5.016-8.36-5.016-20.9 0-29.259l35.945-61.86C44.723 5.851 55.59 0 64.786 0h71.055c9.196 0 20.063 6.688 25.079 14.211l35.945 61.86c4.18 8.36 4.18 20.899 0 29.258l-35.945 61.86c-4.18 8.36-15.883 14.211-25.079 14.211H64.786Z' fill='black' fill-rule='nonzero'/%3e%3c/svg%3e")}.mask-circle{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle fill='black' cx='100' cy='100' r='100' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M46.154 0H200l-46.154 153.846H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-2{mask-image:url("data:image/svg+xml,%3csvg width='200' height='154' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.846 0H0l46.154 153.846H200z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-3{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.077 47.077v153.846l153.846-46.154V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-parallelogram-4{mask-image:url("data:image/svg+xml,%3csvg width='154' height='201' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M153.923 47.077v153.846L.077 154.77V.923z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-pentagon{mask-image:url("data:image/svg+xml,%3csvg width='192' height='181' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 0 95.106 69.098-36.327 111.804H37.22L.894 69.098z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-square{mask-image:url("data:image/svg+xml,%3csvg width='200' height='200' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M0 0h200v200H0z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 137.263-58.779 42.024 22.163-68.389L.894 68.481l72.476-.243L96 0l22.63 68.238 72.476.243-58.49 42.417 22.163 68.389z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-star-2{mask-image:url("data:image/svg+xml,%3csvg width='192' height='180' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m96 153.044-58.779 26.243 7.02-63.513L.894 68.481l63.117-13.01L96 0l31.989 55.472 63.117 13.01-43.347 47.292 7.02 63.513z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle{mask-image:url("data:image/svg+xml,%3csvg width='174' height='149' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 148.476-86.603.185L43.86 74.423 87 0l43.14 74.423 43.463 74.238z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-2{mask-image:url("data:image/svg+xml,%3csvg width='174' height='150' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m87 .738 86.603-.184-43.463 74.238L87 149.214 43.86 74.792.397.554z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-3{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='m149.369 87.107.185 86.603-74.239-43.463L.893 87.107l74.422-43.14L149.554.505z' fill-rule='evenodd'/%3e%3c/svg%3e")}.mask-triangle-4{mask-image:url("data:image/svg+xml,%3csvg width='150' height='174' xmlns='http://www.w3.org/2000/svg'%3e%3cpath fill='black' d='M.631 87.107.446.505l74.239 43.462 74.422 43.14-74.422 43.14L.446 173.71z' fill-rule='evenodd'/%3e%3c/svg%3e")}.menu{padding:.5rem}:where(.menuli:empty){--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;margin:.5rem 1rem;height:1px}.menu :where(liul){margin-inline-start:1rem;padding-inline-start:.5rem}.menu :where(liul):before{position:absolute;bottom:.75rem;inset-inline-start:0;top:.75rem;width:1px;--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));opacity:.1;content:""}.menu :where(li:not(.menu-title)>:not(ul,details,.menu-title,.btn)),.menu :where(li:not(.menu-title)>details>summary:not(.menu-title)){border-radius:var(--rounded-btn,.5rem);padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;text-align:start;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);text-wrap:balance}:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(summary,.active,.btn):focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):is(summary):not(.active,.btn):focus-visible,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn).focus,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(summary,.active,.btn):focus{cursor:pointer;background-color:var(--fallback-bc,oklch(var(--bc)/.1));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));outline:2px solid transparent;outline-offset:2px}@media (hover:hover){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{cursor:pointer;outline:2px solid transparent;outline-offset:2px}@supports (color:oklch(0% 0 0)){:where(.menuli:not(.menu-title,.disabled)>:not(ul,details,.menu-title)):not(.active,.btn):hover,:where(.menuli:not(.menu-title,.disabled)>details>summary:not(.menu-title)):not(.active,.btn):hover{background-color:var(--fallback-bc,oklch(var(--bc)/.1))}}}.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}@media(hover:hover){.menu li>:not(ul,.menu-title,details,.btn).active,.menu li>:not(ul,.menu-title,details,.btn):active,.menu li>details>summary:active{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}}.menu li.disabled{color:var(--fallback-bc,oklch(var(--bc)/.3))}.menu :where(li>details>summary)::-webkit-details-marker{display:none}.menu :where(li>.menu-dropdown-toggle):after,.menu :where(li>details>summary):after{justify-self:end;display:block;margin-top:-.5rem;height:.5rem;width:.5rem;transform:rotate(45deg);transition-property:transform,margin-top;transition-duration:.3s;transition-timing-function:cubic-bezier(.4,0,.2,1);content:"";transform-origin:75% 75%;box-shadow:2px 2px;pointer-events:none}.menu :where(li>.menu-dropdown-toggle.menu-dropdown-show):after,.menu :where(li>details[open]>summary):after{transform:rotate(225deg);margin-top:0}.menu-title{padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;font-size:.875rem;line-height:1.25rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.4))}.mockup-code{min-width:18rem;border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));padding-top:1.25rem;padding-bottom:1.25rem;--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));direction:ltr}.mockup-code:before{content:"";margin-bottom:1rem;display:block;height:.75rem;width:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-code pre{padding-right:1.25rem}.mockup-code pre:before{content:"";margin-right:2ch}.mockup-code pre[data-prefix]:before{content:attr(data-prefix);width:2rem;opacity:.5}.mockup-window{display:flex;flex-direction:column;border-radius:var(--rounded-box,1rem);padding-top:1.25rem}.mockup-window:before{content:"";margin-bottom:1rem;display:block;aspect-ratio:1/1;height:.75rem;flex-shrink:0;align-self:flex-start;border-radius:9999px;opacity:.3}.mockup-window:where([dir=rtl],[dir=rtl]*):before{align-self:flex-end}.mockup-window:before{box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-phone{display:inline-block;border:4px solid #444;border-radius:50px;background-color:#000;padding:10px;margin:0 auto;overflow:hidden}.mockup-phone .camera{position:relative;top:0;left:0;background:#000;height:25px;width:150px;margin:0 auto;border-bottom-left-radius:17px;border-bottom-right-radius:17px;z-index:11}.mockup-phone .camera:before{content:"";position:absolute;top:35%;left:50%;width:50px;height:4px;border-radius:5px;background-color:#0c0b0e;transform:translate(-50%,-50%)}.mockup-phone .camera:after{content:"";position:absolute;top:20%;left:70%;width:8px;height:8px;border-radius:5px;background-color:#0f0b25}.mockup-phone .display{overflow:hidden;border-radius:40px;margin-top:-25px}.mockup-browser{border-radius:var(--rounded-box,1rem)}.mockup-browser .mockup-browser-toolbar{margin-top:.75rem;margin-bottom:.75rem;display:inline-flex;width:100%;align-items:center;padding-right:1.4em}.mockup-browser .mockup-browser-toolbar:where([dir=rtl],[dir=rtl]*){flex-direction:row-reverse}.mockup-browser .mockup-browser-toolbar:before{content:"";margin-right:4.8rem;display:inline-block;aspect-ratio:1/1;height:.75rem;border-radius:9999px;opacity:.3;box-shadow:1.4em 0,2.8em 0,4.2em 0}.mockup-browser .mockup-browser-toolbar .input{position:relative;margin-left:auto;margin-right:auto;display:block;height:1.75rem;width:24rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding-left:2rem;direction:ltr}.mockup-browser .mockup-browser-toolbar .input:before{content:"";position:absolute;left:.5rem;top:50%;aspect-ratio:1/1;height:.75rem;--tw-translate-y:-50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:2px;border-color:currentColor;opacity:.6}.mockup-browser .mockup-browser-toolbar .input:after{content:"";position:absolute;left:1.25rem;top:50%;height:.5rem;--tw-translate-y:25%;--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-radius:9999px;border-width:1px;border-color:currentColor;opacity:.6}.modal{background-color:transparent;color:inherit;transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity,visibility;overflow-y:hidden;overscroll-behavior:contain}.modal::backdrop,.modal:not(dialog:not(.modal-open)){background-color:#0006;animation:modal-pop .2s ease-out}.modal-backdrop{z-index:-1;grid-column-start:1;grid-row-start:1;display:grid;align-self:stretch;justify-self:stretch;color:transparent}.modal-box{grid-column-start:1;grid-row-start:1;width:91.666667%;max-width:32rem;--tw-scale-x:.9;--tw-scale-y:.9;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));border-bottom-right-radius:var(--rounded-box,1rem);border-bottom-left-radius:var(--rounded-box,1rem);border-top-left-radius:var(--rounded-box,1rem);border-top-right-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding:1.5rem;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.2s;transition-timing-function:cubic-bezier(0,0,.2,1);box-shadow:rgba(0,0,0,.25) 0 25px 50px -12px;overflow-y:auto;overscroll-behavior:contain}.modal-open .modal-box,.modal-toggle:checked+.modal .modal-box,.modal:target .modal-box,.modal[open] .modal-box{--tw-translate-y:0px;--tw-scale-x:1;--tw-scale-y:1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.modal-action{margin-top:1.5rem;justify-content:flex-end}.modal-action>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}@keyframes modal-pop{0%{opacity:0}}.navbar{padding:var(--navbar-padding,.5rem);min-height:4rem;width:100%}.progress{height:.5rem;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.2))}.progress::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-moz-progress-bar{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{--progress-color:var(--fallback-bc,oklch(var(--bc)/1))}.progress-primary:indeterminate{--progress-color:var(--fallback-p,oklch(var(--p)/1))}.progress-secondary:indeterminate{--progress-color:var(--fallback-s,oklch(var(--s)/1))}.progress-accent:indeterminate{--progress-color:var(--fallback-a,oklch(var(--a)/1))}.progress-info:indeterminate{--progress-color:var(--fallback-in,oklch(var(--in)/1))}.progress-success:indeterminate{--progress-color:var(--fallback-su,oklch(var(--su)/1))}.progress-warning:indeterminate{--progress-color:var(--fallback-wa,oklch(var(--wa)/1))}.progress-error:indeterminate{--progress-color:var(--fallback-er,oklch(var(--er)/1))}.progress::-webkit-progress-bar{border-radius:var(--rounded-box,1rem);background-color:transparent}.progress::-webkit-progress-value{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)))}.progress-primary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)))}.progress-secondary::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)))}.progress-accent::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)))}.progress-info::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.progress-success::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.progress-warning::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.progress-error::-webkit-progress-value{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.progress:indeterminate{background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}.progress:indeterminate::-moz-progress-bar{background-color:transparent;background-image:repeating-linear-gradient(90deg,var(--progress-color) -1%,var(--progress-color) 10%,transparent 10%,transparent 90%);background-size:200%;background-position-x:15%;animation:progress-loading 5s ease-in-out infinite}@keyframes progress-loading{50%{background-position-x:-115%}}.radial-progress{--value:0;--size:5rem;--thickness:calc(var(--size) / 10)}.radial-progress:after{background-color:currentColor}.radio{--chkbg:var(--bc);height:1.5rem;width:1.5rem;cursor:pointer;appearance:none;border-radius:9999px;border-width:1px;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2}.radio:focus{box-shadow:none}.radio:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/1))}.radio:checked,.radio[aria-checked=true]{--tw-bg-opacity:1;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));background-image:none;animation:radiomark var(--animation-input,.2s) ease-out;box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}.radio-primary{--chkbg:var(--p);--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}@media(hover:hover){.radio-primary:hover{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}}.radio-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.radio-primary:checked,.radio-primary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.radio-secondary{--chkbg:var(--s);--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}@media(hover:hover){.radio-secondary:hover{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}}.radio-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.radio-secondary:checked,.radio-secondary[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.radio-accent{--chkbg:var(--a);--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}@media(hover:hover){.radio-accent:hover{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}}.radio-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.radio-accent:checked,.radio-accent[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.radio-success{--chkbg:var(--su);--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}@media(hover:hover){.radio-success:hover{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}}.radio-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.radio-success:checked,.radio-success[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.radio-warning{--chkbg:var(--wa);--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}@media(hover:hover){.radio-warning:hover{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}}.radio-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.radio-warning:checked,.radio-warning[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.radio-info{--chkbg:var(--in);--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}@media(hover:hover){.radio-info:hover{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}}.radio-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.radio-info:checked,.radio-info[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.radio-error{--chkbg:var(--er);--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}@media(hover:hover){.radio-error:hover{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}}.radio-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.radio-error:checked,.radio-error[aria-checked=true]{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.radio:disabled{cursor:not-allowed;opacity:.2}@keyframes radiomark{0%{box-shadow:0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 12px var(--fallback-b1,oklch(var(--b1)/1)) inset}50%{box-shadow:0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 3px var(--fallback-b1,oklch(var(--b1)/1)) inset}100%{box-shadow:0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset}}.radio-mark{display:none}.range{appearance:none;-webkit-appearance:none;--range-shdw:var(--fallback-bc,oklch(var(--bc)/1));overflow:hidden;border-radius:var(--rounded-box,1rem);background-color:transparent}.range:focus-visible::-webkit-slider-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range:focus-visible::-moz-range-thumb{--focus-shadow:0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset,0 0 0 2rem var(--range-shdw) inset}.range::-webkit-slider-runnable-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-moz-range-track{height:.5rem;width:100%;border-radius:var(--rounded-box,1rem);background-color:var(--fallback-bc,oklch(var(--bc)/.1))}.range::-webkit-slider-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));appearance:none;-webkit-appearance:none;top:50%;color:var(--range-shdw);transform:translateY(-50%);--filler-size:100rem;--filler-offset:0.6rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range::-moz-range-thumb{position:relative;height:1.5rem;width:1.5rem;border-radius:var(--rounded-box,1rem);border-style:none;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));top:50%;color:var(--range-shdw);--filler-size:100rem;--filler-offset:0.5rem;box-shadow:0 0 0 3px var(--range-shdw) inset,var(--focus-shadow,0 0),calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size)}.range-primary{--range-shdw:var(--fallback-p,oklch(var(--p)/1))}.range-secondary{--range-shdw:var(--fallback-s,oklch(var(--s)/1))}.range-accent{--range-shdw:var(--fallback-a,oklch(var(--a)/1))}.range-success{--range-shdw:var(--fallback-su,oklch(var(--su)/1))}.range-warning{--range-shdw:var(--fallback-wa,oklch(var(--wa)/1))}.range-info{--range-shdw:var(--fallback-in,oklch(var(--in)/1))}.range-error{--range-shdw:var(--fallback-er,oklch(var(--er)/1))}.rating input{appearance:none;-webkit-appearance:none}.rating :where(input){animation:rating-pop var(--animation-input,.25s) ease-out;height:1.5rem;width:1.5rem;background-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));--tw-bg-opacity:1}.rating .rating-hidden{width:.5rem;background-color:transparent}.rating input[type=radio]:checked{background-image:none}.rating input:checked~input,.rating input[aria-checked=true]~input{--tw-bg-opacity:0.2}.rating input:focus-visible{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transform:translateY(-.125em)}.rating input:active:focus{animation:none;transform:translateY(-.125em)}.rating-half :where(input:not(.rating-hidden)){width:.75rem}@keyframes rating-pop{0%{transform:translateY(-.125em)}40%{transform:translateY(-.125em)}100%{transform:translateY(0)}}.select{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-inline-end:2.5rem}.select-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select{background-image:linear-gradient(45deg,transparent 50%,currentColor 50%),linear-gradient(135deg,currentColor 50%,transparent 50%);background-position:calc(100% - 20px) calc(1px + 50%),calc(100% - 16.1px) calc(1px + 50%);background-size:4px 4px,4px 4px;background-repeat:no-repeat}.select:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.select-ghost{--tw-bg-opacity:0.05}.select-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.select-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.select-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.select-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.select-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.select-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.select-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.select-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.select-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.select-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.select-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.select-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.select-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.select-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.select-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.select-disabled,.select:disabled,.select[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.select-disabled::placeholder,.select:disabled::placeholder,.select[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.select-multiple,.select[multiple],.select[size].select:not([size="1"]){background-image:none;padding-right:1rem}[dir=rtl] .select{background-position:calc(0% + 12px) calc(1px + 50%),calc(0% + 16px) calc(1px + 50%)}.skeleton{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));will-change:background-position;animation:skeleton 1.8s ease-in-out infinite;background-image:linear-gradient(105deg,transparent 0,transparent 40%,var(--fallback-b1,oklch(var(--b1)/1)) 50%,transparent 60%,transparent 100%);background-size:200% auto;background-repeat:no-repeat;background-position-x:-50%}@media (prefers-reduced-motion){.skeleton{animation-duration:15s}}@keyframes skeleton{from{background-position:150%}to{background-position:-50%}}.stack{place-items:center;align-items:flex-end}.stack>*{width:100%;opacity:.6}.stack>:nth-child(2){opacity:.8}.stack>:nth-child(1){opacity:1}.stats{border-radius:var(--rounded-box,1rem);--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}:where(.stats)>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:0;border-right-width:calc(1px * var(--tw-divide-x-reverse));border-left-width:calc(1px * calc(1 - var(--tw-divide-x-reverse)));--tw-divide-y-reverse:0;border-top-width:calc(0px * calc(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(0px * var(--tw-divide-y-reverse))}:where(.stats){overflow-x:auto}[dir=rtl] .stats>:not([hidden])~:not([hidden]){--tw-divide-x-reverse:1}.stat{column-gap:1rem;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.1;padding-left:1.5rem;padding-right:1.5rem;padding-top:1rem;padding-bottom:1rem}.stat-title{color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-value{font-size:2.25rem;line-height:2.5rem;font-weight:800}.stat-desc{font-size:.75rem;line-height:1rem;color:var(--fallback-bc,oklch(var(--bc)/.6))}.stat-actions{margin-top:1rem}.steps .step{grid-template-rows:40px 1fr;grid-template-columns:auto;min-width:4rem}.steps .step:before{top:0;grid-column-start:1;grid-row-start:1;height:.5rem;width:100%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));content:"";margin-inline-start:-100%}.steps .step:after{content:counter(step);counter-increment:step;z-index:1;position:relative;grid-column-start:1;grid-row-start:1;display:grid;height:2rem;width:2rem;place-items:center;place-self:center;border-radius:9999px;--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}.steps .step:first-child:before{content:none}.steps .step[data-content]:after{content:attr(data-content)}.steps .step-neutral+.step-neutral:before,.steps .step-neutral:after{--tw-bg-opacity:1;background-color:var(--fallback-n,oklch(var(--n)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)))}.steps .step-primary+.step-primary:before,.steps .step-primary:after{--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.steps .step-secondary+.step-secondary:before,.steps .step-secondary:after{--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.steps .step-accent+.step-accent:before,.steps .step-accent:after{--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.steps .step-info+.step-info:before{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)))}.steps .step-info:after{--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.steps .step-success+.step-success:before{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)))}.steps .step-success:after{--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.steps .step-warning+.step-warning:before{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)))}.steps .step-warning:after{--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.steps .step-error+.step-error:before{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)))}.steps .step-error:after{--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.swap{cursor:pointer}.swap>*{transition-duration:.3s;transition-timing-function:cubic-bezier(0,0,.2,1);transition-property:transform,opacity}.swap-rotate .swap-indeterminate,.swap-rotate .swap-on,.swap-rotate input:indeterminate~.swap-on{--tw-rotate:45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-off,.swap-rotate input:checked~.swap-off,.swap-rotate input:indeterminate~.swap-off{--tw-rotate:-45deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-active:where(.swap-rotate) .swap-on,.swap-rotate input:checked~.swap-on,.swap-rotate input:indeterminate~.swap-indeterminate{--tw-rotate:0deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.swap-flip{transform-style:preserve-3d;perspective:16em}.swap-flip .swap-indeterminate,.swap-flip .swap-on,.swap-flip input:indeterminate~.swap-on{transform:rotateY(180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-off,.swap-flip input:checked~.swap-off,.swap-flip input:indeterminate~.swap-off{transform:rotateY(-180deg);backface-visibility:hidden;opacity:1}.swap-active:where(.swap-flip) .swap-on,.swap-flip input:checked~.swap-on,.swap-flip input:indeterminate~.swap-indeterminate{transform:rotateY(0)}.tabs-lifted>.tab:focus-visible{border-end-end-radius:0;border-end-start-radius:0}.tab{--tw-text-opacity:0.5}@media(hover:hover){.tab:hover{--tw-text-opacity:1}}.tab{--tab-color:var(--fallback-bc,oklch(var(--bc)/1));--tab-bg:var(--fallback-b1,oklch(var(--b1)/1));--tab-border-color:var(--fallback-b3,oklch(var(--b3)/1));color:var(--tab-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem)}.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tab:is(input:checked){border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:1;--tw-text-opacity:1}.tab:focus{outline:2px solid transparent;outline-offset:2px}.tab:focus-visible{outline:2px solid currentColor;outline-offset:-5px}.tab-disabled,.tab[disabled]{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}@media (hover:hover){.tab[disabled],.tab[disabled]:hover{cursor:not-allowed;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));--tw-text-opacity:0.2}}.tabs-bordered>.tab{border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));--tw-border-opacity:0.2;border-style:solid;border-bottom-width:calc(var(--tab-border,1px) + 1px)}.tabs-lifted>.tab{border:var(--tab-border,1px) solid transparent;border-width:0 0 var(--tab-border,1px) 0;border-start-start-radius:var(--tab-radius,.5rem);border-start-end-radius:var(--tab-radius,.5rem);border-bottom-color:var(--tab-border-color);padding-inline-start:var(--tab-padding,1rem);padding-inline-end:var(--tab-padding,1rem);padding-top:var(--tab-border,1px)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-lifted>.tab:is(input:checked){background-color:var(--tab-bg);border-width:var(--tab-border,1px) var(--tab-border,1px) 0 var(--tab-border,1px);border-inline-start-color:var(--tab-border-color);border-inline-end-color:var(--tab-border-color);border-top-color:var(--tab-border-color);padding-inline-start:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-inline-end:calc(var(--tab-padding,1rem) - var(--tab-border,1px));padding-bottom:var(--tab-border,1px);padding-top:0}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before,.tabs-lifted>.tab:is(input:checked):before{z-index:1;content:"";display:block;position:absolute;width:calc(100% + var(--tab-radius,.5rem) * 2);height:var(--tab-radius,.5rem);bottom:0;background-size:var(--tab-radius,.5rem);background-position:top left,top right;background-repeat:no-repeat;--tab-grad:calc(69% - var(--tab-border, 1px));--radius-start:radial-gradient(
+        circle at top left,
+        transparent var(--tab-grad),
+        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
+        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
+        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
+      );--radius-end:radial-gradient(
+        circle at top right,
+        transparent var(--tab-grad),
+        var(--tab-border-color) calc(var(--tab-grad) + 0.25px),
+        var(--tab-border-color) calc(var(--tab-grad) + var(--tab-border, 1px)),
+        var(--tab-bg) calc(var(--tab-grad) + var(--tab-border, 1px) + 0.25px)
+      );background-image:var(--radius-start),var(--radius-end)}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,.tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-end);background-position:top right}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):first-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):first-child:before{background-image:var(--radius-start);background-position:top left}.tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,.tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-start);background-position:top left}[dir=rtl] .tabs-lifted>.tab:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):last-child:before,[dir=rtl] .tabs-lifted>.tab:is(input:checked):last-child:before{background-image:var(--radius-end);background-position:top right}.tabs-lifted>.tab:is(input:checked)+.tabs-lifted .tab:is(input:checked):before,.tabs-lifted>:is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled])+.tabs-lifted :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):before{background-image:var(--radius-end);background-position:top right}.tabs-boxed{border-radius:var(--rounded-btn,.5rem);--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));padding:.25rem}.tabs-boxed .tab{border-radius:var(--rounded-btn,.5rem)}.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]),.tabs-boxed :is(input:checked){--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}@media(hover:hover){.tabs-boxed :is(.tab-active,[aria-selected=true]):not(.tab-disabled):not([disabled]):hover,.tabs-boxed :is(input:checked):hover{--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}}.table{border-radius:var(--rounded-box,1rem);text-align:left;font-size:.875rem;line-height:1.25rem}.table:where([dir=rtl],[dir=rtl]*){text-align:right}.table :where(th,td){padding-left:1rem;padding-right:1rem;padding-top:.75rem;padding-bottom:.75rem;vertical-align:middle}.table tr.active,.table tr.active:nth-child(even),.table-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}@media(hover:hover){.table tr.hover:hover,.table tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)))}}.table-zebra tr.active,.table-zebra tr.active:nth-child(even),.table-zebra-zebra tbody tr:nth-child(even){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}@media(hover:hover){.table-zebra tr.hover:hover,.table-zebra tr.hover:nth-child(even):hover{--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}}.table :where(theadtr,tbodytr:not(:last-child),tbodytr:first-child:last-child){border-bottom-width:1px;--tw-border-opacity:1;border-bottom-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.table :where(thead,tfoot){white-space:nowrap;font-size:.75rem;line-height:1rem;font-weight:700;color:var(--fallback-bc,oklch(var(--bc)/.6))}.table :where(tfoot){border-top-width:1px;--tw-border-opacity:1;border-top-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)))}.textarea{border-radius:var(--rounded-btn,.5rem);border-width:1px;border-color:transparent;--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)))}.textarea-bordered{border-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea:focus{box-shadow:none;border-color:var(--fallback-bc,oklch(var(--bc)/.2));outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.textarea-ghost{--tw-bg-opacity:0.05}.textarea-ghost:focus{--tw-bg-opacity:1;--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:none}.textarea-primary{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)))}.textarea-primary:focus{--tw-border-opacity:1;border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));outline-color:var(--fallback-p,oklch(var(--p)/1))}.textarea-secondary{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)))}.textarea-secondary:focus{--tw-border-opacity:1;border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));outline-color:var(--fallback-s,oklch(var(--s)/1))}.textarea-accent{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)))}.textarea-accent:focus{--tw-border-opacity:1;border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));outline-color:var(--fallback-a,oklch(var(--a)/1))}.textarea-info{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)))}.textarea-info:focus{--tw-border-opacity:1;border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));outline-color:var(--fallback-in,oklch(var(--in)/1))}.textarea-success{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)))}.textarea-success:focus{--tw-border-opacity:1;border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));outline-color:var(--fallback-su,oklch(var(--su)/1))}.textarea-warning{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)))}.textarea-warning:focus{--tw-border-opacity:1;border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.textarea-error{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)))}.textarea-error:focus{--tw-border-opacity:1;border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));outline-color:var(--fallback-er,oklch(var(--er)/1))}.textarea-disabled,.textarea:disabled,.textarea[disabled]{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b2,oklch(var(--b2)/var(--tw-bg-opacity)));color:var(--fallback-bc,oklch(var(--bc)/.4))}.textarea-disabled::placeholder,.textarea:disabled::placeholder,.textarea[disabled]::placeholder{color:var(--fallback-bc,oklch(var(--bc)/var(--tw-placeholder-opacity)));--tw-placeholder-opacity:0.2}.timeline hr{height:.25rem}:where(.timelinehr){--tw-bg-opacity:1;background-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-bg-opacity)))}:where(.timeline:has(.timeline-middle)hr):first-child{border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}:where(.timeline:has(.timeline-middle)hr):last-child{border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):first-childhr:last-child){border-start-start-radius:var(--rounded-badge,1.9rem);border-end-start-radius:var(--rounded-badge,1.9rem);border-start-end-radius:0;border-end-end-radius:0}:where(.timeline:not(:has(.timeline-middle)):last-childhr:first-child){border-start-end-radius:var(--rounded-badge,1.9rem);border-end-end-radius:var(--rounded-badge,1.9rem);border-start-start-radius:0;border-end-start-radius:0}.timeline-box{border-radius:var(--rounded-box,1rem);border-width:1px;--tw-border-opacity:1;border-color:var(--fallback-b3,oklch(var(--b3)/var(--tw-border-opacity)));--tw-bg-opacity:1;background-color:var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));padding-left:1rem;padding-right:1rem;padding-top:.5rem;padding-bottom:.5rem;--tw-shadow:0 1px 2px 0 rgb(0 0 0 / 0.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.toast{gap:.5rem;padding:1rem}.toast>*{animation:toast-pop .25s ease-out}@keyframes toast-pop{0%{transform:scale(.9);opacity:0}100%{transform:scale(1);opacity:1}}.toggle{--tglbg:var(--fallback-b1,oklch(var(--b1)/1));--handleoffset:1.5rem;--handleoffsetcalculator:calc(var(--handleoffset) * -1);--togglehandleborder:0 0;height:1.5rem;width:3rem;cursor:pointer;appearance:none;border-radius:var(--rounded-badge,1.9rem);border-width:1px;border-color:currentColor;background-color:currentColor;color:var(--fallback-bc,oklch(var(--bc)/.5));transition:background,box-shadow var(--animation-input,.2s) ease-out;box-shadow:var(--handleoffsetcalculator) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset,var(--togglehandleborder)}[dir=rtl] .toggle{--handleoffsetcalculator:calc(var(--handleoffset) * 1)}.toggle:focus-visible{outline-style:solid;outline-width:2px;outline-offset:2px;outline-color:var(--fallback-bc,oklch(var(--bc)/.2))}.toggle:hover{background-color:currentColor}.toggle:checked,.toggle[aria-checked=true]{background-image:none;--handleoffsetcalculator:var(--handleoffset);--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)))}[dir=rtl] .toggle:checked,[dir=rtl] .toggle[aria-checked=true]{--handleoffsetcalculator:calc(var(--handleoffset) * -1)}.toggle:indeterminate{--tw-text-opacity:1;color:var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}[dir=rtl] .toggle:indeterminate{box-shadow:calc(var(--handleoffset)/ 2) 0 0 2px var(--tglbg) inset,calc(var(--handleoffset)/ -2) 0 0 2px var(--tglbg) inset,0 0 0 2px var(--tglbg) inset}.toggle-primary:focus-visible{outline-color:var(--fallback-p,oklch(var(--p)/1))}.toggle-primary:checked,.toggle-primary[aria-checked=true]{border-color:var(--fallback-p,oklch(var(--p)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-p,oklch(var(--p)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-pc,oklch(var(--pc)/var(--tw-text-opacity)))}.toggle-secondary:focus-visible{outline-color:var(--fallback-s,oklch(var(--s)/1))}.toggle-secondary:checked,.toggle-secondary[aria-checked=true]{border-color:var(--fallback-s,oklch(var(--s)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-s,oklch(var(--s)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-sc,oklch(var(--sc)/var(--tw-text-opacity)))}.toggle-accent:focus-visible{outline-color:var(--fallback-a,oklch(var(--a)/1))}.toggle-accent:checked,.toggle-accent[aria-checked=true]{border-color:var(--fallback-a,oklch(var(--a)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-a,oklch(var(--a)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-ac,oklch(var(--ac)/var(--tw-text-opacity)))}.toggle-success:focus-visible{outline-color:var(--fallback-su,oklch(var(--su)/1))}.toggle-success:checked,.toggle-success[aria-checked=true]{border-color:var(--fallback-su,oklch(var(--su)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-su,oklch(var(--su)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-suc,oklch(var(--suc)/var(--tw-text-opacity)))}.toggle-warning:focus-visible{outline-color:var(--fallback-wa,oklch(var(--wa)/1))}.toggle-warning:checked,.toggle-warning[aria-checked=true]{border-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-wa,oklch(var(--wa)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-wac,oklch(var(--wac)/var(--tw-text-opacity)))}.toggle-info:focus-visible{outline-color:var(--fallback-in,oklch(var(--in)/1))}.toggle-info:checked,.toggle-info[aria-checked=true]{border-color:var(--fallback-in,oklch(var(--in)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-in,oklch(var(--in)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-inc,oklch(var(--inc)/var(--tw-text-opacity)))}.toggle-error:focus-visible{outline-color:var(--fallback-er,oklch(var(--er)/1))}.toggle-error:checked,.toggle-error[aria-checked=true]{border-color:var(--fallback-er,oklch(var(--er)/var(--tw-border-opacity)));--tw-border-opacity:0.1;--tw-bg-opacity:1;background-color:var(--fallback-er,oklch(var(--er)/var(--tw-bg-opacity)));--tw-text-opacity:1;color:var(--fallback-erc,oklch(var(--erc)/var(--tw-text-opacity)))}.toggle:disabled{cursor:not-allowed;--tw-border-opacity:1;border-color:var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));background-color:transparent;opacity:.3;--togglehandleborder:0 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset,var(--handleoffsetcalculator) 0 0 3px var(--fallback-bc,oklch(var(--bc)/1)) inset}.toggle-mark{display:none}:root .prose{--tw-prose-body:var(--fallback-bc,oklch(var(--bc)/0.8));--tw-prose-headings:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-lead:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-links:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bold:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-counters:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-bullets:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-hr:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-quotes:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-quote-borders:var(--fallback-bc,oklch(var(--bc)/0.2));--tw-prose-captions:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-code:var(--fallback-bc,oklch(var(--bc)/1));--tw-prose-pre-code:var(--fallback-nc,oklch(var(--nc)/1));--tw-prose-pre-bg:var(--fallback-n,oklch(var(--n)/1));--tw-prose-th-borders:var(--fallback-bc,oklch(var(--bc)/0.5));--tw-prose-td-borders:var(--fallback-bc,oklch(var(--bc)/0.2))}.prose :where(code):not(:where([class~=not-prose]*,pre*)){padding:1px 8px;border-radius:var(--rounded-badge);font-weight:initial;background-color:var(--fallback-bc,oklch(var(--bc)/.1))}@supports not (color:oklch(0% 0 0)){.prose :where(code):not(:where([class~=not-prose]*,pre*)){background-color:var(--fallback-b3,oklch(var(--b3)/1))}}.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::after,.prose :where(code):not(:where([class~=not-prose],[class~=not-prose]*))::before{display:none}.prose pre code{border-radius:0;padding:0}.prose :where(tbodytr,thead):not(:where([class~=not-prose]*)){border-bottom-color:var(--fallback-bc,oklch(var(--bc)/.2))}:root{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}@media (prefers-color-scheme:dark){:root{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}}[data-theme=light]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=light]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:89.824% 0.06192 275.75;--ac:15.352% 0.0368 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:49.12% 0.3096 275.75;--s:69.71% 0.329 342.55;--sc:98.71% 0.0106 342.55;--a:76.76% 0.184 183.61;--n:32.1785% 0.02476 255.701624;--nc:89.4994% 0.011585 252.096176;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.4169% 0.00108 197.137559;--bc:27.8078% 0.029596 256.847952}[data-theme=dark]{color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}:root:has(input.theme-controller[value=dark]:checked){color-scheme:dark;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:13.138% 0.0392 275.75;--sc:14.96% 0.052 342.55;--ac:14.902% 0.0334 183.61;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:65.69% 0.196 275.75;--s:74.8% 0.26 342.55;--a:74.51% 0.167 183.61;--n:31.3815% 0.021108 254.139175;--nc:74.6477% 0.0216 264.435964;--b1:25.3267% 0.015896 252.417568;--b2:23.2607% 0.013807 253.100675;--b3:21.1484% 0.01165 254.087939;--bc:74.6477% 0.0216 264.435964}[data-theme=cupcake]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}:root:has(input.theme-controller[value=cupcake]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:15.2344% 0.017892 200.026556;--sc:15.787% 0.020249 356.29965;--ac:15.8762% 0.029206 78.618794;--nc:84.7148% 0.013247 313.189598;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--p:76.172% 0.089459 200.026556;--s:78.9351% 0.101246 356.29965;--a:79.3811% 0.146032 78.618794;--n:23.5742% 0.066235 313.189598;--b1:97.7882% 0.00418 56.375637;--b2:93.9822% 0.007638 61.449292;--b3:91.5861% 0.006811 53.440502;--bc:23.5742% 0.066235 313.189598;--rounded-btn:1.9rem;--tab-border:2px;--tab-radius:0.7rem}[data-theme=bumblebee]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}:root:has(input.theme-controller[value=bumblebee]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--ac:16.254% 0.0314 56.52;--nc:82.55% 0.015 281.99;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:89.51% 0.2132 96.61;--pc:38.92% 0.046 96.61;--s:80.39% 0.194 70.76;--sc:39.38% 0.068 70.76;--a:81.27% 0.157 56.52;--n:12.75% 0.075 281.99;--b1:100% 0 0}[data-theme=emerald]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=emerald]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:76.6626% 0.135433 153.450024;--pc:33.3872% 0.040618 162.240129;--s:61.3028% 0.202368 261.294233;--sc:100% 0 0;--a:72.7725% 0.149783 33.200363;--ac:0% 0 0;--n:35.5192% 0.032071 262.988584;--nc:98.4625% 0.001706 247.838921;--b1:100% 0 0;--bc:35.5192% 0.032071 262.988584;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=corporate]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=corporate]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:12.078% 0.0456 269.1;--sc:13.0739% 0.010951 256.688055;--ac:15.3934% 0.022799 163.57888;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--border-btn:1px;--tab-border:1px;--p:60.39% 0.228 269.1;--s:65.3694% 0.054756 256.688055;--a:76.9669% 0.113994 163.57888;--n:22.3899% 0.031305 278.07229;--nc:95.8796% 0.008588 247.915135;--b1:100% 0 0;--bc:22.3899% 0.031305 278.07229;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem;--tab-radius:0.25rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=synthwave]{color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}:root:has(input.theme-controller[value=synthwave]:checked){color-scheme:dark;--b2:20.2941% 0.076211 287.835609;--b3:18.7665% 0.070475 287.835609;--pc:14.4421% 0.031903 342.009383;--sc:15.6543% 0.02362 227.382405;--ac:17.608% 0.0412 93.72;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:72.2105% 0.159514 342.009383;--s:78.2714% 0.118101 227.382405;--a:88.04% 0.206 93.72;--n:25.5554% 0.103537 286.507967;--nc:97.9365% 0.00819 301.358346;--b1:21.8216% 0.081948 287.835609;--bc:97.9365% 0.00819 301.358346;--in:76.5197% 0.12273 231.831603;--inc:23.5017% 0.096418 290.329844;--su:86.0572% 0.115038 178.624677;--suc:23.5017% 0.096418 290.329844;--wa:85.531% 0.122117 93.722227;--wac:23.5017% 0.096418 290.329844;--er:73.7005% 0.121339 32.639257;--erc:23.5017% 0.096418 290.329844}[data-theme=retro]{color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}:root:has(input.theme-controller[value=retro]:checked){color-scheme:light;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:76.8664% 0.104092 22.664655;--pc:26.5104% 0.006243 0.522862;--s:80.7415% 0.052534 159.094608;--sc:26.5104% 0.006243 0.522862;--a:70.3919% 0.125455 52.953428;--ac:26.5104% 0.006243 0.522862;--n:28.4181% 0.009519 355.534017;--nc:92.5604% 0.025113 89.217311;--b1:91.6374% 0.034554 90.51575;--b2:88.2722% 0.049418 91.774344;--b3:84.133% 0.065952 90.856665;--bc:26.5104% 0.006243 0.522862;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33;--rounded-box:0.4rem;--rounded-btn:0.4rem;--rounded-badge:0.4rem;--tab-radius:0.4rem}[data-theme=cyberpunk]{color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}:root:has(input.theme-controller[value=cyberpunk]:checked){color-scheme:light;--b2:87.8943% 0.16647 104.32;--b3:81.2786% 0.15394 104.32;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:18.902% 0.0358 104.32;--pc:14.844% 0.0418 6.35;--sc:16.666% 0.0368 204.72;--ac:14.372% 0.04352 310.43;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;--p:74.22% 0.209 6.35;--s:83.33% 0.184 204.72;--a:71.86% 0.2176 310.43;--n:23.04% 0.065 269.31;--nc:94.51% 0.179 104.32;--b1:94.51% 0.179 104.32;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--tab-radius:0}[data-theme=valentine]{color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=valentine]:checked){color-scheme:light;--b2:88.0567% 0.024834 337.06289;--b3:81.4288% 0.022964 337.06289;--pc:13.7239% 0.030755 15.066527;--sc:14.3942% 0.029258 293.189609;--ac:14.2537% 0.014961 197.828857;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.614% 0.0414 27.33;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:68.6197% 0.153774 15.066527;--s:71.971% 0.14629 293.189609;--a:71.2685% 0.074804 197.828857;--n:54.6053% 0.143342 358.004839;--nc:90.2701% 0.037202 336.955191;--b1:94.6846% 0.026703 337.06289;--bc:37.3085% 0.081131 4.606426;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.07% 0.207 27.33;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=halloween]{color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}:root:has(input.theme-controller[value=halloween]:checked){color-scheme:dark;--b2:23.0416% 0 0;--b3:21.3072% 0 0;--bc:84.9552% 0 0;--sc:89.196% 0.0496 305.03;--nc:84.8742% 0.009322 65.681484;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:13.144% 0.0398 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:77.48% 0.204 60.62;--pc:19.6935% 0.004671 196.779412;--s:45.98% 0.248 305.03;--a:64.8% 0.223 136.073479;--ac:0% 0 0;--n:24.371% 0.046608 65.681484;--b1:24.7759% 0 0;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:65.72% 0.199 27.33}[data-theme=garden]{color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}:root:has(input.theme-controller[value=garden]:checked){color-scheme:light;--b2:86.4453% 0.002011 17.197414;--b3:79.9386% 0.00186 17.197414;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--sc:89.699% 0.022197 355.095988;--ac:11.2547% 0.010859 154.390187;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:62.45% 0.278 3.83636;--pc:100% 0 0;--s:48.4952% 0.110985 355.095988;--a:56.2735% 0.054297 154.390187;--n:24.1559% 0.049362 89.070594;--nc:92.9519% 0.002163 17.197414;--b1:92.9519% 0.002163 17.197414;--bc:16.9617% 0.001664 17.32068}[data-theme=forest]{color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}:root:has(input.theme-controller[value=forest]:checked){color-scheme:dark;--b2:17.522% 0.007709 17.911578;--b3:16.2032% 0.007129 17.911578;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:83.7682% 0.001658 17.911578;--sc:13.9553% 0.027077 168.327128;--ac:14.1257% 0.02389 185.713193;--nc:86.1397% 0.007806 171.364646;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:68.6283% 0.185567 148.958922;--pc:0% 0 0;--s:69.7764% 0.135385 168.327128;--a:70.6285% 0.119451 185.713193;--n:30.6985% 0.039032 171.364646;--b1:18.8409% 0.00829 17.911578;--rounded-btn:1.9rem}[data-theme=aqua]{color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}:root:has(input.theme-controller[value=aqua]:checked){color-scheme:dark;--b2:45.3464% 0.118611 261.181672;--b3:41.9333% 0.109683 261.181672;--bc:89.7519% 0.025508 261.181672;--sc:12.1365% 0.02175 309.782946;--ac:18.6854% 0.020445 94.555431;--nc:12.2124% 0.023402 243.760661;--inc:90.923% 0.043042 262.880917;--suc:12.541% 0.033982 149.213788;--wac:13.3168% 0.031484 58.31834;--erc:14.79% 0.038 27.33;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:85.6617% 0.14498 198.6458;--pc:40.1249% 0.068266 197.603872;--s:60.6827% 0.108752 309.782946;--a:93.4269% 0.102225 94.555431;--n:61.0622% 0.117009 243.760661;--b1:48.7596% 0.127539 261.181672;--in:54.615% 0.215208 262.880917;--su:62.7052% 0.169912 149.213788;--wa:66.584% 0.157422 58.31834;--er:73.95% 0.19 27.33}[data-theme=lofi]{color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}:root:has(input.theme-controller[value=lofi]:checked){color-scheme:light;--inc:15.908% 0.0206 205.9;--suc:18.026% 0.0306 164.14;--wac:17.674% 0.027 79.94;--erc:15.732% 0.03 28.47;--border-btn:1px;--tab-border:1px;--p:15.9066% 0 0;--pc:100% 0 0;--s:21.455% 0.001566 17.278957;--sc:100% 0 0;--a:26.8618% 0 0;--ac:100% 0 0;--n:0% 0 0;--nc:100% 0 0;--b1:100% 0 0;--b2:96.1151% 0 0;--b3:92.268% 0.001082 17.17934;--bc:0% 0 0;--in:79.54% 0.103 205.9;--su:90.13% 0.153 164.14;--wa:88.37% 0.135 79.94;--er:78.66% 0.15 28.47;--rounded-box:0.25rem;--rounded-btn:0.125rem;--rounded-badge:0.125rem;--tab-radius:0.125rem;--animation-btn:0;--animation-input:0;--btn-focus-scale:1}[data-theme=pastel]{color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=pastel]:checked){color-scheme:light;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--bc:20% 0 0;--pc:16.6166% 0.006979 316.8737;--sc:17.6153% 0.009839 8.688364;--ac:17.8419% 0.012056 170.923263;--nc:14.2681% 0.014702 228.183906;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:83.0828% 0.034896 316.8737;--s:88.0763% 0.049197 8.688364;--a:89.2096% 0.06028 170.923263;--n:71.3406% 0.07351 228.183906;--b1:100% 0 0;--b2:98.4625% 0.001706 247.838921;--b3:87.1681% 0.009339 258.338227;--rounded-btn:1.9rem;--tab-radius:0.7rem}[data-theme=fantasy]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}:root:has(input.theme-controller[value=fantasy]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--in:72.06% 0.191 231.6;--su:64.8% 0.150 160;--wa:84.71% 0.199 83.87;--er:71.76% 0.221 22.18;--pc:87.49% 0.0378 325.02;--sc:90.784% 0.0324 241.36;--ac:15.196% 0.0408 56.72;--nc:85.5616% 0.005919 256.847952;--inc:0% 0 0;--suc:0% 0 0;--wac:0% 0 0;--erc:0% 0 0;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:37.45% 0.189 325.02;--s:53.92% 0.162 241.36;--a:75.98% 0.204 56.72;--n:27.8078% 0.029596 256.847952;--b1:100% 0 0;--bc:27.8078% 0.029596 256.847952}[data-theme=wireframe]{color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=wireframe]:checked){color-scheme:light;--bc:20% 0 0;--pc:15.6521% 0 0;--sc:15.6521% 0 0;--ac:15.6521% 0 0;--nc:18.8014% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:14.1626% 0.019994 108.702381;--erc:12.5591% 0.051537 29.233885;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;font-family:Chalkboard,comic sans ms,sans-serif;--p:78.2604% 0 0;--s:78.2604% 0 0;--a:78.2604% 0 0;--n:94.007% 0 0;--b1:100% 0 0;--b2:94.9119% 0 0;--b3:89.7547% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:70.8131% 0.099969 108.702381;--er:62.7955% 0.257683 29.233885;--rounded-box:0.2rem;--rounded-btn:0.2rem;--rounded-badge:0.2rem;--tab-radius:0.2rem}[data-theme=black]{color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}:root:has(input.theme-controller[value=black]:checked){color-scheme:dark;--pc:86.736% 0 0;--sc:86.736% 0 0;--ac:86.736% 0 0;--nc:86.736% 0 0;--inc:89.0403% 0.062643 264.052021;--suc:90.395% 0.035372 142.495339;--wac:19.3597% 0.042201 109.769232;--erc:12.5591% 0.051537 29.233885;--border-btn:1px;--tab-border:1px;--p:33.6799% 0 0;--s:33.6799% 0 0;--a:33.6799% 0 0;--b1:0% 0 0;--b2:19.1251% 0 0;--b3:26.8618% 0 0;--bc:87.6096% 0 0;--n:33.6799% 0 0;--in:45.2014% 0.313214 264.052021;--su:51.9752% 0.176858 142.495339;--wa:96.7983% 0.211006 109.769232;--er:62.7955% 0.257683 29.233885;--rounded-box:0;--rounded-btn:0;--rounded-badge:0;--animation-btn:0;--animation-input:0;--btn-focus-scale:1;--tab-radius:0}[data-theme=luxury]{color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}:root:has(input.theme-controller[value=luxury]:checked){color-scheme:dark;--pc:20% 0 0;--sc:85.5163% 0.012821 261.069149;--ac:87.3349% 0.010348 338.82597;--inc:15.8122% 0.024356 237.133883;--suc:15.6239% 0.038579 132.154381;--wac:17.2255% 0.027305 102.89115;--erc:14.3506% 0.035271 22.568916;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:100% 0 0;--s:27.5815% 0.064106 261.069149;--a:36.6744% 0.051741 338.82597;--n:24.27% 0.057015 59.825019;--nc:93.2033% 0.089631 90.861683;--b1:14.0765% 0.004386 285.822869;--b2:20.2191% 0.004211 308.22937;--b3:29.8961% 0.003818 308.318612;--bc:75.6879% 0.123666 76.890484;--in:79.0612% 0.121778 237.133883;--su:78.1197% 0.192894 132.154381;--wa:86.1274% 0.136524 102.89115;--er:71.7531% 0.176357 22.568916}[data-theme=dracula]{color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}:root:has(input.theme-controller[value=dracula]:checked){color-scheme:dark;--b2:26.8053% 0.020556 277.508664;--b3:24.7877% 0.019009 277.508664;--pc:15.0922% 0.036614 346.812432;--sc:14.8405% 0.029709 301.883095;--ac:16.6785% 0.024826 66.558491;--nc:87.8891% 0.006515 275.524078;--inc:17.6526% 0.018676 212.846491;--suc:17.4199% 0.043903 148.024881;--wac:19.1068% 0.026849 112.757109;--erc:13.6441% 0.041266 24.430965;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.4611% 0.18307 346.812432;--s:74.2023% 0.148546 301.883095;--a:83.3927% 0.124132 66.558491;--n:39.4456% 0.032576 275.524078;--b1:28.8229% 0.022103 277.508664;--bc:97.7477% 0.007913 106.545019;--in:88.263% 0.09338 212.846491;--su:87.0995% 0.219516 148.024881;--wa:95.5338% 0.134246 112.757109;--er:68.2204% 0.206328 24.430965}[data-theme=cmyk]{color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}:root:has(input.theme-controller[value=cmyk]:checked){color-scheme:light;--b2:93% 0 0;--b3:86% 0 0;--bc:20% 0 0;--pc:14.3544% 0.02666 239.443325;--sc:12.8953% 0.040552 359.339283;--ac:18.8458% 0.037948 105.306968;--nc:84.3557% 0 0;--inc:13.6952% 0.0189 217.284104;--suc:89.3898% 0.032505 321.406278;--wac:14.2473% 0.031969 52.023412;--erc:12.4027% 0.041677 28.717543;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.7722% 0.133298 239.443325;--s:64.4766% 0.202758 359.339283;--a:94.2289% 0.189741 105.306968;--n:21.7787% 0 0;--b1:100% 0 0;--in:68.4759% 0.094499 217.284104;--su:46.949% 0.162524 321.406278;--wa:71.2364% 0.159843 52.023412;--er:62.0133% 0.208385 28.717543}[data-theme=autumn]{color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}:root:has(input.theme-controller[value=autumn]:checked){color-scheme:light;--b2:89.1077% 0 0;--b3:82.4006% 0 0;--bc:19.1629% 0 0;--pc:88.1446% 0.032232 17.530175;--sc:12.3353% 0.033821 23.865865;--ac:14.6851% 0.018999 60.729616;--nc:90.8734% 0.007475 51.902819;--inc:13.8449% 0.019596 207.284192;--suc:12.199% 0.016032 174.616213;--wac:14.0163% 0.032982 56.844303;--erc:90.614% 0.0482 24.16;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:40.7232% 0.16116 17.530175;--s:61.6763% 0.169105 23.865865;--a:73.4253% 0.094994 60.729616;--n:54.3672% 0.037374 51.902819;--b1:95.8147% 0 0;--in:69.2245% 0.097979 207.284192;--su:60.9951% 0.080159 174.616213;--wa:70.0817% 0.164909 56.844303;--er:53.07% 0.241 24.16}[data-theme=business]{color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}:root:has(input.theme-controller[value=business]:checked){color-scheme:dark;--b2:22.6487% 0 0;--b3:20.944% 0 0;--bc:84.8707% 0 0;--pc:88.3407% 0.019811 251.473931;--sc:12.8185% 0.005481 229.389418;--ac:13.4542% 0.033545 35.791525;--nc:85.4882% 0.00265 253.041249;--inc:12.5233% 0.028702 240.033697;--suc:14.0454% 0.018919 156.59611;--wac:15.4965% 0.023141 81.519177;--erc:90.3221% 0.029356 29.674507;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:41.7036% 0.099057 251.473931;--s:64.0924% 0.027405 229.389418;--a:67.271% 0.167726 35.791525;--n:27.441% 0.01325 253.041249;--b1:24.3535% 0 0;--in:62.6163% 0.143511 240.033697;--su:70.2268% 0.094594 156.59611;--wa:77.4824% 0.115704 81.519177;--er:51.6105% 0.14678 29.674507;--rounded-box:0.25rem;--rounded-btn:.125rem;--rounded-badge:.125rem}[data-theme=acid]{color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=acid]:checked){color-scheme:light;--b2:91.6146% 0 0;--b3:84.7189% 0 0;--bc:19.7021% 0 0;--pc:14.38% 0.0714 330.759573;--sc:14.674% 0.0448 48.250878;--ac:18.556% 0.0528 122.962951;--nc:84.262% 0.0256 278.68;--inc:12.144% 0.0454 252.05;--suc:17.144% 0.0532 158.53;--wac:18.202% 0.0424 100.5;--erc:12.968% 0.0586 29.349188;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:71.9% 0.357 330.759573;--s:73.37% 0.224 48.250878;--a:92.78% 0.264 122.962951;--n:21.31% 0.128 278.68;--b1:98.5104% 0 0;--in:60.72% 0.227 252.05;--su:85.72% 0.266 158.53;--wa:91.01% 0.212 100.5;--er:64.84% 0.293 29.349188;--rounded-box:1.25rem;--rounded-btn:1rem;--rounded-badge:1rem;--tab-radius:0.7rem}[data-theme=lemonade]{color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}:root:has(input.theme-controller[value=lemonade]:checked){color-scheme:light;--b2:91.8003% 0.0186 123.72;--b3:84.8906% 0.0172 123.72;--bc:19.742% 0.004 123.72;--pc:11.784% 0.0398 134.6;--sc:15.55% 0.0392 111.09;--ac:17.078% 0.0402 100.73;--nc:86.196% 0.015 108.6;--inc:17.238% 0.0094 224.14;--suc:17.238% 0.0094 157.85;--wac:17.238% 0.0094 102.15;--erc:17.238% 0.0094 25.85;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:58.92% 0.199 134.6;--s:77.75% 0.196 111.09;--a:85.39% 0.201 100.73;--n:30.98% 0.075 108.6;--b1:98.71% 0.02 123.72;--in:86.19% 0.047 224.14;--su:86.19% 0.047 157.85;--wa:86.19% 0.047 102.15;--er:86.19% 0.047 25.85}[data-theme=night]{color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}:root:has(input.theme-controller[value=night]:checked){color-scheme:dark;--b2:19.3144% 0.037037 265.754874;--b3:17.8606% 0.034249 265.754874;--bc:84.1536% 0.007965 265.754874;--pc:15.0703% 0.027798 232.66148;--sc:13.6023% 0.031661 276.934902;--ac:14.4721% 0.035244 350.048739;--nc:85.5899% 0.00737 260.030984;--suc:15.6904% 0.026506 181.911977;--wac:16.6486% 0.027912 82.95003;--erc:14.3572% 0.034051 13.11834;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:75.3513% 0.138989 232.66148;--s:68.0113% 0.158303 276.934902;--a:72.3603% 0.176218 350.048739;--n:27.9495% 0.036848 260.030984;--b1:20.7682% 0.039824 265.754874;--in:68.4553% 0.148062 237.25135;--inc:0% 0 0;--su:78.452% 0.132529 181.911977;--wa:83.2428% 0.139558 82.95003;--er:71.7858% 0.170255 13.11834}[data-theme=coffee]{color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}:root:has(input.theme-controller[value=coffee]:checked){color-scheme:dark;--b2:20.1585% 0.021457 329.708637;--b3:18.6412% 0.019842 329.708637;--pc:14.3993% 0.024765 62.756393;--sc:86.893% 0.00597 199.19444;--ac:88.5243% 0.014881 224.389184;--nc:83.3022% 0.003149 326.261446;--inc:15.898% 0.012774 184.558367;--suc:14.9445% 0.014491 131.116276;--wac:17.6301% 0.028162 87.722413;--erc:15.4637% 0.025644 31.871922;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:71.9967% 0.123825 62.756393;--s:34.465% 0.029849 199.19444;--a:42.6213% 0.074405 224.389184;--n:16.5109% 0.015743 326.261446;--b1:21.6758% 0.023072 329.708637;--bc:72.3547% 0.092794 79.129387;--in:79.4902% 0.063869 184.558367;--su:74.7224% 0.072456 131.116276;--wa:88.1503% 0.140812 87.722413;--er:77.3187% 0.12822 31.871922}[data-theme=winter]{color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}:root:has(input.theme-controller[value=winter]:checked){color-scheme:light;--pc:91.372% 0.051 257.57;--sc:88.5103% 0.03222 282.339433;--ac:11.988% 0.038303 335.171434;--nc:83.9233% 0.012704 257.651965;--inc:17.6255% 0.017178 214.515264;--suc:16.0988% 0.015404 197.823719;--wac:17.8345% 0.009167 71.47031;--erc:14.6185% 0.022037 20.076293;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:56.86% 0.255 257.57;--s:42.5516% 0.161098 282.339433;--a:59.9398% 0.191515 335.171434;--n:19.6166% 0.063518 257.651965;--b1:100% 0 0;--b2:97.4663% 0.011947 259.822565;--b3:93.2686% 0.016223 262.751375;--bc:41.8869% 0.053885 255.824911;--in:88.1275% 0.085888 214.515264;--su:80.4941% 0.077019 197.823719;--wa:89.1725% 0.045833 71.47031;--er:73.0926% 0.110185 20.076293}[data-theme=dim]{color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}:root:has(input.theme-controller[value=dim]:checked){color-scheme:dark;--pc:17.2267% 0.028331 139.549991;--sc:14.6752% 0.033181 35.353059;--ac:14.8459% 0.026728 311.37924;--inc:17.2157% 0.028409 206.182959;--suc:17.2343% 0.028437 166.534048;--wac:17.2327% 0.028447 94.818679;--erc:16.4838% 0.019914 33.756357;--rounded-box:1rem;--rounded-btn:0.5rem;--rounded-badge:1.9rem;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--tab-radius:0.5rem;--p:86.1335% 0.141656 139.549991;--s:73.3759% 0.165904 35.353059;--a:74.2296% 0.133641 311.37924;--n:24.7311% 0.020483 264.094728;--nc:82.9011% 0.031335 222.959324;--b1:30.8577% 0.023243 264.149498;--b2:28.0368% 0.01983 264.182074;--b3:26.3469% 0.018403 262.177739;--bc:82.9011% 0.031335 222.959324;--in:86.0785% 0.142046 206.182959;--su:86.1717% 0.142187 166.534048;--wa:86.1634% 0.142236 94.818679;--er:82.4189% 0.09957 33.756357}[data-theme=nord]{color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}:root:has(input.theme-controller[value=nord]:checked){color-scheme:light;--pc:11.8872% 0.015449 254.027774;--sc:13.9303% 0.011822 248.687186;--ac:15.4929% 0.01245 217.469017;--inc:13.8414% 0.012499 332.664922;--suc:15.3654% 0.01498 131.063061;--wac:17.0972% 0.017847 84.093335;--erc:12.122% 0.024119 15.341883;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:59.4359% 0.077246 254.027774;--s:69.6516% 0.059108 248.687186;--a:77.4643% 0.062249 217.469017;--n:45.229% 0.035214 264.1312;--nc:89.9258% 0.016374 262.749256;--b1:95.1276% 0.007445 260.731539;--b2:93.2996% 0.010389 261.788485;--b3:89.9258% 0.016374 262.749256;--bc:32.4374% 0.022945 264.182036;--in:69.2072% 0.062496 332.664922;--su:76.827% 0.074899 131.063061;--wa:85.4862% 0.089234 84.093335;--er:60.61% 0.120594 15.341883;--rounded-box:0.4rem;--rounded-btn:0.2rem;--rounded-badge:0.4rem;--tab-radius:0.2rem}[data-theme=sunset]{color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}:root:has(input.theme-controller[value=sunset]:checked){color-scheme:dark;--pc:14.9408% 0.031656 39.94703;--sc:14.5075% 0.035531 2.72034;--ac:14.2589% 0.033336 299.844533;--inc:17.1119% 0.017054 206.015183;--suc:17.1122% 0.017172 144.77874;--wac:17.1139% 0.016961 74.427797;--erc:17.1023% 0.015778 16.886379;--animation-btn:0.25s;--animation-input:.2s;--btn-focus-scale:0.95;--border-btn:1px;--tab-border:1px;--p:74.7039% 0.158278 39.94703;--s:72.5375% 0.177654 2.72034;--a:71.2947% 0.166678 299.844533;--n:26% 0.019 237.69;--nc:70% 0.019 237.69;--b1:22% 0.019 237.69;--b2:20% 0.019 237.69;--b3:18% 0.019 237.69;--bc:77.3835% 0.043586 245.096534;--in:85.5596% 0.085271 206.015183;--su:85.5609% 0.08586 144.77874;--wa:85.5695% 0.084806 74.427797;--er:85.5116% 0.07889 16.886379;--rounded-box:1.2rem;--rounded-btn:0.8rem;--rounded-badge:0.4rem;--tab-radius:0.7rem}
diff --git a/examples/server/public/deps_markdown-it.js b/examples/server/public/deps_markdown-it.js
new file mode 100644
index 000000000..1be0cebe6
--- /dev/null
+++ b/examples/server/public/deps_markdown-it.js
@@ -0,0 +1,8442 @@
+/*! markdown-it 13.0.2 https://github.com/markdown-it/markdown-it @license MIT */
+(function(global, factory) {
+  typeof exports === "object" && typeof module !== "undefined" ? module.exports = factory() : typeof define === "function" && define.amd ? define(factory) : (global = typeof globalThis !== "undefined" ? globalThis : global || self, 
+  global.markdownit = factory());
+})(this, (function() {
+  "use strict";
+  function createCommonjsModule(fn, basedir, module) {
+    return module = {
+      path: basedir,
+      exports: {},
+      require: function(path, base) {
+        return commonjsRequire(path, base === undefined || base === null ? module.path : base);
+      }
+    }, fn(module, module.exports), module.exports;
+  }
+  function getAugmentedNamespace(n) {
+    if (n.__esModule) return n;
+    var a = Object.defineProperty({}, "__esModule", {
+      value: true
+    });
+    Object.keys(n).forEach((function(k) {
+      var d = Object.getOwnPropertyDescriptor(n, k);
+      Object.defineProperty(a, k, d.get ? d : {
+        enumerable: true,
+        get: function() {
+          return n[k];
+        }
+      });
+    }));
+    return a;
+  }
+  function commonjsRequire() {
+    throw new Error("Dynamic requires are not currently supported by @rollup/plugin-commonjs");
+  }
+  var require$$0 = {
+    Aacute: "\xc1",
+    aacute: "\xe1",
+    Abreve: "\u0102",
+    abreve: "\u0103",
+    ac: "\u223e",
+    acd: "\u223f",
+    acE: "\u223e\u0333",
+    Acirc: "\xc2",
+    acirc: "\xe2",
+    acute: "\xb4",
+    Acy: "\u0410",
+    acy: "\u0430",
+    AElig: "\xc6",
+    aelig: "\xe6",
+    af: "\u2061",
+    Afr: "\ud835\udd04",
+    afr: "\ud835\udd1e",
+    Agrave: "\xc0",
+    agrave: "\xe0",
+    alefsym: "\u2135",
+    aleph: "\u2135",
+    Alpha: "\u0391",
+    alpha: "\u03b1",
+    Amacr: "\u0100",
+    amacr: "\u0101",
+    amalg: "\u2a3f",
+    amp: "&",
+    AMP: "&",
+    andand: "\u2a55",
+    And: "\u2a53",
+    and: "\u2227",
+    andd: "\u2a5c",
+    andslope: "\u2a58",
+    andv: "\u2a5a",
+    ang: "\u2220",
+    ange: "\u29a4",
+    angle: "\u2220",
+    angmsdaa: "\u29a8",
+    angmsdab: "\u29a9",
+    angmsdac: "\u29aa",
+    angmsdad: "\u29ab",
+    angmsdae: "\u29ac",
+    angmsdaf: "\u29ad",
+    angmsdag: "\u29ae",
+    angmsdah: "\u29af",
+    angmsd: "\u2221",
+    angrt: "\u221f",
+    angrtvb: "\u22be",
+    angrtvbd: "\u299d",
+    angsph: "\u2222",
+    angst: "\xc5",
+    angzarr: "\u237c",
+    Aogon: "\u0104",
+    aogon: "\u0105",
+    Aopf: "\ud835\udd38",
+    aopf: "\ud835\udd52",
+    apacir: "\u2a6f",
+    ap: "\u2248",
+    apE: "\u2a70",
+    ape: "\u224a",
+    apid: "\u224b",
+    apos: "'",
+    ApplyFunction: "\u2061",
+    approx: "\u2248",
+    approxeq: "\u224a",
+    Aring: "\xc5",
+    aring: "\xe5",
+    Ascr: "\ud835\udc9c",
+    ascr: "\ud835\udcb6",
+    Assign: "\u2254",
+    ast: "*",
+    asymp: "\u2248",
+    asympeq: "\u224d",
+    Atilde: "\xc3",
+    atilde: "\xe3",
+    Auml: "\xc4",
+    auml: "\xe4",
+    awconint: "\u2233",
+    awint: "\u2a11",
+    backcong: "\u224c",
+    backepsilon: "\u03f6",
+    backprime: "\u2035",
+    backsim: "\u223d",
+    backsimeq: "\u22cd",
+    Backslash: "\u2216",
+    Barv: "\u2ae7",
+    barvee: "\u22bd",
+    barwed: "\u2305",
+    Barwed: "\u2306",
+    barwedge: "\u2305",
+    bbrk: "\u23b5",
+    bbrktbrk: "\u23b6",
+    bcong: "\u224c",
+    Bcy: "\u0411",
+    bcy: "\u0431",
+    bdquo: "\u201e",
+    becaus: "\u2235",
+    because: "\u2235",
+    Because: "\u2235",
+    bemptyv: "\u29b0",
+    bepsi: "\u03f6",
+    bernou: "\u212c",
+    Bernoullis: "\u212c",
+    Beta: "\u0392",
+    beta: "\u03b2",
+    beth: "\u2136",
+    between: "\u226c",
+    Bfr: "\ud835\udd05",
+    bfr: "\ud835\udd1f",
+    bigcap: "\u22c2",
+    bigcirc: "\u25ef",
+    bigcup: "\u22c3",
+    bigodot: "\u2a00",
+    bigoplus: "\u2a01",
+    bigotimes: "\u2a02",
+    bigsqcup: "\u2a06",
+    bigstar: "\u2605",
+    bigtriangledown: "\u25bd",
+    bigtriangleup: "\u25b3",
+    biguplus: "\u2a04",
+    bigvee: "\u22c1",
+    bigwedge: "\u22c0",
+    bkarow: "\u290d",
+    blacklozenge: "\u29eb",
+    blacksquare: "\u25aa",
+    blacktriangle: "\u25b4",
+    blacktriangledown: "\u25be",
+    blacktriangleleft: "\u25c2",
+    blacktriangleright: "\u25b8",
+    blank: "\u2423",
+    blk12: "\u2592",
+    blk14: "\u2591",
+    blk34: "\u2593",
+    block: "\u2588",
+    bne: "=\u20e5",
+    bnequiv: "\u2261\u20e5",
+    bNot: "\u2aed",
+    bnot: "\u2310",
+    Bopf: "\ud835\udd39",
+    bopf: "\ud835\udd53",
+    bot: "\u22a5",
+    bottom: "\u22a5",
+    bowtie: "\u22c8",
+    boxbox: "\u29c9",
+    boxdl: "\u2510",
+    boxdL: "\u2555",
+    boxDl: "\u2556",
+    boxDL: "\u2557",
+    boxdr: "\u250c",
+    boxdR: "\u2552",
+    boxDr: "\u2553",
+    boxDR: "\u2554",
+    boxh: "\u2500",
+    boxH: "\u2550",
+    boxhd: "\u252c",
+    boxHd: "\u2564",
+    boxhD: "\u2565",
+    boxHD: "\u2566",
+    boxhu: "\u2534",
+    boxHu: "\u2567",
+    boxhU: "\u2568",
+    boxHU: "\u2569",
+    boxminus: "\u229f",
+    boxplus: "\u229e",
+    boxtimes: "\u22a0",
+    boxul: "\u2518",
+    boxuL: "\u255b",
+    boxUl: "\u255c",
+    boxUL: "\u255d",
+    boxur: "\u2514",
+    boxuR: "\u2558",
+    boxUr: "\u2559",
+    boxUR: "\u255a",
+    boxv: "\u2502",
+    boxV: "\u2551",
+    boxvh: "\u253c",
+    boxvH: "\u256a",
+    boxVh: "\u256b",
+    boxVH: "\u256c",
+    boxvl: "\u2524",
+    boxvL: "\u2561",
+    boxVl: "\u2562",
+    boxVL: "\u2563",
+    boxvr: "\u251c",
+    boxvR: "\u255e",
+    boxVr: "\u255f",
+    boxVR: "\u2560",
+    bprime: "\u2035",
+    breve: "\u02d8",
+    Breve: "\u02d8",
+    brvbar: "\xa6",
+    bscr: "\ud835\udcb7",
+    Bscr: "\u212c",
+    bsemi: "\u204f",
+    bsim: "\u223d",
+    bsime: "\u22cd",
+    bsolb: "\u29c5",
+    bsol: "\\",
+    bsolhsub: "\u27c8",
+    bull: "\u2022",
+    bullet: "\u2022",
+    bump: "\u224e",
+    bumpE: "\u2aae",
+    bumpe: "\u224f",
+    Bumpeq: "\u224e",
+    bumpeq: "\u224f",
+    Cacute: "\u0106",
+    cacute: "\u0107",
+    capand: "\u2a44",
+    capbrcup: "\u2a49",
+    capcap: "\u2a4b",
+    cap: "\u2229",
+    Cap: "\u22d2",
+    capcup: "\u2a47",
+    capdot: "\u2a40",
+    CapitalDifferentialD: "\u2145",
+    caps: "\u2229\ufe00",
+    caret: "\u2041",
+    caron: "\u02c7",
+    Cayleys: "\u212d",
+    ccaps: "\u2a4d",
+    Ccaron: "\u010c",
+    ccaron: "\u010d",
+    Ccedil: "\xc7",
+    ccedil: "\xe7",
+    Ccirc: "\u0108",
+    ccirc: "\u0109",
+    Cconint: "\u2230",
+    ccups: "\u2a4c",
+    ccupssm: "\u2a50",
+    Cdot: "\u010a",
+    cdot: "\u010b",
+    cedil: "\xb8",
+    Cedilla: "\xb8",
+    cemptyv: "\u29b2",
+    cent: "\xa2",
+    centerdot: "\xb7",
+    CenterDot: "\xb7",
+    cfr: "\ud835\udd20",
+    Cfr: "\u212d",
+    CHcy: "\u0427",
+    chcy: "\u0447",
+    check: "\u2713",
+    checkmark: "\u2713",
+    Chi: "\u03a7",
+    chi: "\u03c7",
+    circ: "\u02c6",
+    circeq: "\u2257",
+    circlearrowleft: "\u21ba",
+    circlearrowright: "\u21bb",
+    circledast: "\u229b",
+    circledcirc: "\u229a",
+    circleddash: "\u229d",
+    CircleDot: "\u2299",
+    circledR: "\xae",
+    circledS: "\u24c8",
+    CircleMinus: "\u2296",
+    CirclePlus: "\u2295",
+    CircleTimes: "\u2297",
+    cir: "\u25cb",
+    cirE: "\u29c3",
+    cire: "\u2257",
+    cirfnint: "\u2a10",
+    cirmid: "\u2aef",
+    cirscir: "\u29c2",
+    ClockwiseContourIntegral: "\u2232",
+    CloseCurlyDoubleQuote: "\u201d",
+    CloseCurlyQuote: "\u2019",
+    clubs: "\u2663",
+    clubsuit: "\u2663",
+    colon: ":",
+    Colon: "\u2237",
+    Colone: "\u2a74",
+    colone: "\u2254",
+    coloneq: "\u2254",
+    comma: ",",
+    commat: "@",
+    comp: "\u2201",
+    compfn: "\u2218",
+    complement: "\u2201",
+    complexes: "\u2102",
+    cong: "\u2245",
+    congdot: "\u2a6d",
+    Congruent: "\u2261",
+    conint: "\u222e",
+    Conint: "\u222f",
+    ContourIntegral: "\u222e",
+    copf: "\ud835\udd54",
+    Copf: "\u2102",
+    coprod: "\u2210",
+    Coproduct: "\u2210",
+    copy: "\xa9",
+    COPY: "\xa9",
+    copysr: "\u2117",
+    CounterClockwiseContourIntegral: "\u2233",
+    crarr: "\u21b5",
+    cross: "\u2717",
+    Cross: "\u2a2f",
+    Cscr: "\ud835\udc9e",
+    cscr: "\ud835\udcb8",
+    csub: "\u2acf",
+    csube: "\u2ad1",
+    csup: "\u2ad0",
+    csupe: "\u2ad2",
+    ctdot: "\u22ef",
+    cudarrl: "\u2938",
+    cudarrr: "\u2935",
+    cuepr: "\u22de",
+    cuesc: "\u22df",
+    cularr: "\u21b6",
+    cularrp: "\u293d",
+    cupbrcap: "\u2a48",
+    cupcap: "\u2a46",
+    CupCap: "\u224d",
+    cup: "\u222a",
+    Cup: "\u22d3",
+    cupcup: "\u2a4a",
+    cupdot: "\u228d",
+    cupor: "\u2a45",
+    cups: "\u222a\ufe00",
+    curarr: "\u21b7",
+    curarrm: "\u293c",
+    curlyeqprec: "\u22de",
+    curlyeqsucc: "\u22df",
+    curlyvee: "\u22ce",
+    curlywedge: "\u22cf",
+    curren: "\xa4",
+    curvearrowleft: "\u21b6",
+    curvearrowright: "\u21b7",
+    cuvee: "\u22ce",
+    cuwed: "\u22cf",
+    cwconint: "\u2232",
+    cwint: "\u2231",
+    cylcty: "\u232d",
+    dagger: "\u2020",
+    Dagger: "\u2021",
+    daleth: "\u2138",
+    darr: "\u2193",
+    Darr: "\u21a1",
+    dArr: "\u21d3",
+    dash: "\u2010",
+    Dashv: "\u2ae4",
+    dashv: "\u22a3",
+    dbkarow: "\u290f",
+    dblac: "\u02dd",
+    Dcaron: "\u010e",
+    dcaron: "\u010f",
+    Dcy: "\u0414",
+    dcy: "\u0434",
+    ddagger: "\u2021",
+    ddarr: "\u21ca",
+    DD: "\u2145",
+    dd: "\u2146",
+    DDotrahd: "\u2911",
+    ddotseq: "\u2a77",
+    deg: "\xb0",
+    Del: "\u2207",
+    Delta: "\u0394",
+    delta: "\u03b4",
+    demptyv: "\u29b1",
+    dfisht: "\u297f",
+    Dfr: "\ud835\udd07",
+    dfr: "\ud835\udd21",
+    dHar: "\u2965",
+    dharl: "\u21c3",
+    dharr: "\u21c2",
+    DiacriticalAcute: "\xb4",
+    DiacriticalDot: "\u02d9",
+    DiacriticalDoubleAcute: "\u02dd",
+    DiacriticalGrave: "`",
+    DiacriticalTilde: "\u02dc",
+    diam: "\u22c4",
+    diamond: "\u22c4",
+    Diamond: "\u22c4",
+    diamondsuit: "\u2666",
+    diams: "\u2666",
+    die: "\xa8",
+    DifferentialD: "\u2146",
+    digamma: "\u03dd",
+    disin: "\u22f2",
+    div: "\xf7",
+    divide: "\xf7",
+    divideontimes: "\u22c7",
+    divonx: "\u22c7",
+    DJcy: "\u0402",
+    djcy: "\u0452",
+    dlcorn: "\u231e",
+    dlcrop: "\u230d",
+    dollar: "$",
+    Dopf: "\ud835\udd3b",
+    dopf: "\ud835\udd55",
+    Dot: "\xa8",
+    dot: "\u02d9",
+    DotDot: "\u20dc",
+    doteq: "\u2250",
+    doteqdot: "\u2251",
+    DotEqual: "\u2250",
+    dotminus: "\u2238",
+    dotplus: "\u2214",
+    dotsquare: "\u22a1",
+    doublebarwedge: "\u2306",
+    DoubleContourIntegral: "\u222f",
+    DoubleDot: "\xa8",
+    DoubleDownArrow: "\u21d3",
+    DoubleLeftArrow: "\u21d0",
+    DoubleLeftRightArrow: "\u21d4",
+    DoubleLeftTee: "\u2ae4",
+    DoubleLongLeftArrow: "\u27f8",
+    DoubleLongLeftRightArrow: "\u27fa",
+    DoubleLongRightArrow: "\u27f9",
+    DoubleRightArrow: "\u21d2",
+    DoubleRightTee: "\u22a8",
+    DoubleUpArrow: "\u21d1",
+    DoubleUpDownArrow: "\u21d5",
+    DoubleVerticalBar: "\u2225",
+    DownArrowBar: "\u2913",
+    downarrow: "\u2193",
+    DownArrow: "\u2193",
+    Downarrow: "\u21d3",
+    DownArrowUpArrow: "\u21f5",
+    DownBreve: "\u0311",
+    downdownarrows: "\u21ca",
+    downharpoonleft: "\u21c3",
+    downharpoonright: "\u21c2",
+    DownLeftRightVector: "\u2950",
+    DownLeftTeeVector: "\u295e",
+    DownLeftVectorBar: "\u2956",
+    DownLeftVector: "\u21bd",
+    DownRightTeeVector: "\u295f",
+    DownRightVectorBar: "\u2957",
+    DownRightVector: "\u21c1",
+    DownTeeArrow: "\u21a7",
+    DownTee: "\u22a4",
+    drbkarow: "\u2910",
+    drcorn: "\u231f",
+    drcrop: "\u230c",
+    Dscr: "\ud835\udc9f",
+    dscr: "\ud835\udcb9",
+    DScy: "\u0405",
+    dscy: "\u0455",
+    dsol: "\u29f6",
+    Dstrok: "\u0110",
+    dstrok: "\u0111",
+    dtdot: "\u22f1",
+    dtri: "\u25bf",
+    dtrif: "\u25be",
+    duarr: "\u21f5",
+    duhar: "\u296f",
+    dwangle: "\u29a6",
+    DZcy: "\u040f",
+    dzcy: "\u045f",
+    dzigrarr: "\u27ff",
+    Eacute: "\xc9",
+    eacute: "\xe9",
+    easter: "\u2a6e",
+    Ecaron: "\u011a",
+    ecaron: "\u011b",
+    Ecirc: "\xca",
+    ecirc: "\xea",
+    ecir: "\u2256",
+    ecolon: "\u2255",
+    Ecy: "\u042d",
+    ecy: "\u044d",
+    eDDot: "\u2a77",
+    Edot: "\u0116",
+    edot: "\u0117",
+    eDot: "\u2251",
+    ee: "\u2147",
+    efDot: "\u2252",
+    Efr: "\ud835\udd08",
+    efr: "\ud835\udd22",
+    eg: "\u2a9a",
+    Egrave: "\xc8",
+    egrave: "\xe8",
+    egs: "\u2a96",
+    egsdot: "\u2a98",
+    el: "\u2a99",
+    Element: "\u2208",
+    elinters: "\u23e7",
+    ell: "\u2113",
+    els: "\u2a95",
+    elsdot: "\u2a97",
+    Emacr: "\u0112",
+    emacr: "\u0113",
+    empty: "\u2205",
+    emptyset: "\u2205",
+    EmptySmallSquare: "\u25fb",
+    emptyv: "\u2205",
+    EmptyVerySmallSquare: "\u25ab",
+    emsp13: "\u2004",
+    emsp14: "\u2005",
+    emsp: "\u2003",
+    ENG: "\u014a",
+    eng: "\u014b",
+    ensp: "\u2002",
+    Eogon: "\u0118",
+    eogon: "\u0119",
+    Eopf: "\ud835\udd3c",
+    eopf: "\ud835\udd56",
+    epar: "\u22d5",
+    eparsl: "\u29e3",
+    eplus: "\u2a71",
+    epsi: "\u03b5",
+    Epsilon: "\u0395",
+    epsilon: "\u03b5",
+    epsiv: "\u03f5",
+    eqcirc: "\u2256",
+    eqcolon: "\u2255",
+    eqsim: "\u2242",
+    eqslantgtr: "\u2a96",
+    eqslantless: "\u2a95",
+    Equal: "\u2a75",
+    equals: "=",
+    EqualTilde: "\u2242",
+    equest: "\u225f",
+    Equilibrium: "\u21cc",
+    equiv: "\u2261",
+    equivDD: "\u2a78",
+    eqvparsl: "\u29e5",
+    erarr: "\u2971",
+    erDot: "\u2253",
+    escr: "\u212f",
+    Escr: "\u2130",
+    esdot: "\u2250",
+    Esim: "\u2a73",
+    esim: "\u2242",
+    Eta: "\u0397",
+    eta: "\u03b7",
+    ETH: "\xd0",
+    eth: "\xf0",
+    Euml: "\xcb",
+    euml: "\xeb",
+    euro: "\u20ac",
+    excl: "!",
+    exist: "\u2203",
+    Exists: "\u2203",
+    expectation: "\u2130",
+    exponentiale: "\u2147",
+    ExponentialE: "\u2147",
+    fallingdotseq: "\u2252",
+    Fcy: "\u0424",
+    fcy: "\u0444",
+    female: "\u2640",
+    ffilig: "\ufb03",
+    fflig: "\ufb00",
+    ffllig: "\ufb04",
+    Ffr: "\ud835\udd09",
+    ffr: "\ud835\udd23",
+    filig: "\ufb01",
+    FilledSmallSquare: "\u25fc",
+    FilledVerySmallSquare: "\u25aa",
+    fjlig: "fj",
+    flat: "\u266d",
+    fllig: "\ufb02",
+    fltns: "\u25b1",
+    fnof: "\u0192",
+    Fopf: "\ud835\udd3d",
+    fopf: "\ud835\udd57",
+    forall: "\u2200",
+    ForAll: "\u2200",
+    fork: "\u22d4",
+    forkv: "\u2ad9",
+    Fouriertrf: "\u2131",
+    fpartint: "\u2a0d",
+    frac12: "\xbd",
+    frac13: "\u2153",
+    frac14: "\xbc",
+    frac15: "\u2155",
+    frac16: "\u2159",
+    frac18: "\u215b",
+    frac23: "\u2154",
+    frac25: "\u2156",
+    frac34: "\xbe",
+    frac35: "\u2157",
+    frac38: "\u215c",
+    frac45: "\u2158",
+    frac56: "\u215a",
+    frac58: "\u215d",
+    frac78: "\u215e",
+    frasl: "\u2044",
+    frown: "\u2322",
+    fscr: "\ud835\udcbb",
+    Fscr: "\u2131",
+    gacute: "\u01f5",
+    Gamma: "\u0393",
+    gamma: "\u03b3",
+    Gammad: "\u03dc",
+    gammad: "\u03dd",
+    gap: "\u2a86",
+    Gbreve: "\u011e",
+    gbreve: "\u011f",
+    Gcedil: "\u0122",
+    Gcirc: "\u011c",
+    gcirc: "\u011d",
+    Gcy: "\u0413",
+    gcy: "\u0433",
+    Gdot: "\u0120",
+    gdot: "\u0121",
+    ge: "\u2265",
+    gE: "\u2267",
+    gEl: "\u2a8c",
+    gel: "\u22db",
+    geq: "\u2265",
+    geqq: "\u2267",
+    geqslant: "\u2a7e",
+    gescc: "\u2aa9",
+    ges: "\u2a7e",
+    gesdot: "\u2a80",
+    gesdoto: "\u2a82",
+    gesdotol: "\u2a84",
+    gesl: "\u22db\ufe00",
+    gesles: "\u2a94",
+    Gfr: "\ud835\udd0a",
+    gfr: "\ud835\udd24",
+    gg: "\u226b",
+    Gg: "\u22d9",
+    ggg: "\u22d9",
+    gimel: "\u2137",
+    GJcy: "\u0403",
+    gjcy: "\u0453",
+    gla: "\u2aa5",
+    gl: "\u2277",
+    glE: "\u2a92",
+    glj: "\u2aa4",
+    gnap: "\u2a8a",
+    gnapprox: "\u2a8a",
+    gne: "\u2a88",
+    gnE: "\u2269",
+    gneq: "\u2a88",
+    gneqq: "\u2269",
+    gnsim: "\u22e7",
+    Gopf: "\ud835\udd3e",
+    gopf: "\ud835\udd58",
+    grave: "`",
+    GreaterEqual: "\u2265",
+    GreaterEqualLess: "\u22db",
+    GreaterFullEqual: "\u2267",
+    GreaterGreater: "\u2aa2",
+    GreaterLess: "\u2277",
+    GreaterSlantEqual: "\u2a7e",
+    GreaterTilde: "\u2273",
+    Gscr: "\ud835\udca2",
+    gscr: "\u210a",
+    gsim: "\u2273",
+    gsime: "\u2a8e",
+    gsiml: "\u2a90",
+    gtcc: "\u2aa7",
+    gtcir: "\u2a7a",
+    gt: ">",
+    GT: ">",
+    Gt: "\u226b",
+    gtdot: "\u22d7",
+    gtlPar: "\u2995",
+    gtquest: "\u2a7c",
+    gtrapprox: "\u2a86",
+    gtrarr: "\u2978",
+    gtrdot: "\u22d7",
+    gtreqless: "\u22db",
+    gtreqqless: "\u2a8c",
+    gtrless: "\u2277",
+    gtrsim: "\u2273",
+    gvertneqq: "\u2269\ufe00",
+    gvnE: "\u2269\ufe00",
+    Hacek: "\u02c7",
+    hairsp: "\u200a",
+    half: "\xbd",
+    hamilt: "\u210b",
+    HARDcy: "\u042a",
+    hardcy: "\u044a",
+    harrcir: "\u2948",
+    harr: "\u2194",
+    hArr: "\u21d4",
+    harrw: "\u21ad",
+    Hat: "^",
+    hbar: "\u210f",
+    Hcirc: "\u0124",
+    hcirc: "\u0125",
+    hearts: "\u2665",
+    heartsuit: "\u2665",
+    hellip: "\u2026",
+    hercon: "\u22b9",
+    hfr: "\ud835\udd25",
+    Hfr: "\u210c",
+    HilbertSpace: "\u210b",
+    hksearow: "\u2925",
+    hkswarow: "\u2926",
+    hoarr: "\u21ff",
+    homtht: "\u223b",
+    hookleftarrow: "\u21a9",
+    hookrightarrow: "\u21aa",
+    hopf: "\ud835\udd59",
+    Hopf: "\u210d",
+    horbar: "\u2015",
+    HorizontalLine: "\u2500",
+    hscr: "\ud835\udcbd",
+    Hscr: "\u210b",
+    hslash: "\u210f",
+    Hstrok: "\u0126",
+    hstrok: "\u0127",
+    HumpDownHump: "\u224e",
+    HumpEqual: "\u224f",
+    hybull: "\u2043",
+    hyphen: "\u2010",
+    Iacute: "\xcd",
+    iacute: "\xed",
+    ic: "\u2063",
+    Icirc: "\xce",
+    icirc: "\xee",
+    Icy: "\u0418",
+    icy: "\u0438",
+    Idot: "\u0130",
+    IEcy: "\u0415",
+    iecy: "\u0435",
+    iexcl: "\xa1",
+    iff: "\u21d4",
+    ifr: "\ud835\udd26",
+    Ifr: "\u2111",
+    Igrave: "\xcc",
+    igrave: "\xec",
+    ii: "\u2148",
+    iiiint: "\u2a0c",
+    iiint: "\u222d",
+    iinfin: "\u29dc",
+    iiota: "\u2129",
+    IJlig: "\u0132",
+    ijlig: "\u0133",
+    Imacr: "\u012a",
+    imacr: "\u012b",
+    image: "\u2111",
+    ImaginaryI: "\u2148",
+    imagline: "\u2110",
+    imagpart: "\u2111",
+    imath: "\u0131",
+    Im: "\u2111",
+    imof: "\u22b7",
+    imped: "\u01b5",
+    Implies: "\u21d2",
+    incare: "\u2105",
+    in: "\u2208",
+    infin: "\u221e",
+    infintie: "\u29dd",
+    inodot: "\u0131",
+    intcal: "\u22ba",
+    int: "\u222b",
+    Int: "\u222c",
+    integers: "\u2124",
+    Integral: "\u222b",
+    intercal: "\u22ba",
+    Intersection: "\u22c2",
+    intlarhk: "\u2a17",
+    intprod: "\u2a3c",
+    InvisibleComma: "\u2063",
+    InvisibleTimes: "\u2062",
+    IOcy: "\u0401",
+    iocy: "\u0451",
+    Iogon: "\u012e",
+    iogon: "\u012f",
+    Iopf: "\ud835\udd40",
+    iopf: "\ud835\udd5a",
+    Iota: "\u0399",
+    iota: "\u03b9",
+    iprod: "\u2a3c",
+    iquest: "\xbf",
+    iscr: "\ud835\udcbe",
+    Iscr: "\u2110",
+    isin: "\u2208",
+    isindot: "\u22f5",
+    isinE: "\u22f9",
+    isins: "\u22f4",
+    isinsv: "\u22f3",
+    isinv: "\u2208",
+    it: "\u2062",
+    Itilde: "\u0128",
+    itilde: "\u0129",
+    Iukcy: "\u0406",
+    iukcy: "\u0456",
+    Iuml: "\xcf",
+    iuml: "\xef",
+    Jcirc: "\u0134",
+    jcirc: "\u0135",
+    Jcy: "\u0419",
+    jcy: "\u0439",
+    Jfr: "\ud835\udd0d",
+    jfr: "\ud835\udd27",
+    jmath: "\u0237",
+    Jopf: "\ud835\udd41",
+    jopf: "\ud835\udd5b",
+    Jscr: "\ud835\udca5",
+    jscr: "\ud835\udcbf",
+    Jsercy: "\u0408",
+    jsercy: "\u0458",
+    Jukcy: "\u0404",
+    jukcy: "\u0454",
+    Kappa: "\u039a",
+    kappa: "\u03ba",
+    kappav: "\u03f0",
+    Kcedil: "\u0136",
+    kcedil: "\u0137",
+    Kcy: "\u041a",
+    kcy: "\u043a",
+    Kfr: "\ud835\udd0e",
+    kfr: "\ud835\udd28",
+    kgreen: "\u0138",
+    KHcy: "\u0425",
+    khcy: "\u0445",
+    KJcy: "\u040c",
+    kjcy: "\u045c",
+    Kopf: "\ud835\udd42",
+    kopf: "\ud835\udd5c",
+    Kscr: "\ud835\udca6",
+    kscr: "\ud835\udcc0",
+    lAarr: "\u21da",
+    Lacute: "\u0139",
+    lacute: "\u013a",
+    laemptyv: "\u29b4",
+    lagran: "\u2112",
+    Lambda: "\u039b",
+    lambda: "\u03bb",
+    lang: "\u27e8",
+    Lang: "\u27ea",
+    langd: "\u2991",
+    langle: "\u27e8",
+    lap: "\u2a85",
+    Laplacetrf: "\u2112",
+    laquo: "\xab",
+    larrb: "\u21e4",
+    larrbfs: "\u291f",
+    larr: "\u2190",
+    Larr: "\u219e",
+    lArr: "\u21d0",
+    larrfs: "\u291d",
+    larrhk: "\u21a9",
+    larrlp: "\u21ab",
+    larrpl: "\u2939",
+    larrsim: "\u2973",
+    larrtl: "\u21a2",
+    latail: "\u2919",
+    lAtail: "\u291b",
+    lat: "\u2aab",
+    late: "\u2aad",
+    lates: "\u2aad\ufe00",
+    lbarr: "\u290c",
+    lBarr: "\u290e",
+    lbbrk: "\u2772",
+    lbrace: "{",
+    lbrack: "[",
+    lbrke: "\u298b",
+    lbrksld: "\u298f",
+    lbrkslu: "\u298d",
+    Lcaron: "\u013d",
+    lcaron: "\u013e",
+    Lcedil: "\u013b",
+    lcedil: "\u013c",
+    lceil: "\u2308",
+    lcub: "{",
+    Lcy: "\u041b",
+    lcy: "\u043b",
+    ldca: "\u2936",
+    ldquo: "\u201c",
+    ldquor: "\u201e",
+    ldrdhar: "\u2967",
+    ldrushar: "\u294b",
+    ldsh: "\u21b2",
+    le: "\u2264",
+    lE: "\u2266",
+    LeftAngleBracket: "\u27e8",
+    LeftArrowBar: "\u21e4",
+    leftarrow: "\u2190",
+    LeftArrow: "\u2190",
+    Leftarrow: "\u21d0",
+    LeftArrowRightArrow: "\u21c6",
+    leftarrowtail: "\u21a2",
+    LeftCeiling: "\u2308",
+    LeftDoubleBracket: "\u27e6",
+    LeftDownTeeVector: "\u2961",
+    LeftDownVectorBar: "\u2959",
+    LeftDownVector: "\u21c3",
+    LeftFloor: "\u230a",
+    leftharpoondown: "\u21bd",
+    leftharpoonup: "\u21bc",
+    leftleftarrows: "\u21c7",
+    leftrightarrow: "\u2194",
+    LeftRightArrow: "\u2194",
+    Leftrightarrow: "\u21d4",
+    leftrightarrows: "\u21c6",
+    leftrightharpoons: "\u21cb",
+    leftrightsquigarrow: "\u21ad",
+    LeftRightVector: "\u294e",
+    LeftTeeArrow: "\u21a4",
+    LeftTee: "\u22a3",
+    LeftTeeVector: "\u295a",
+    leftthreetimes: "\u22cb",
+    LeftTriangleBar: "\u29cf",
+    LeftTriangle: "\u22b2",
+    LeftTriangleEqual: "\u22b4",
+    LeftUpDownVector: "\u2951",
+    LeftUpTeeVector: "\u2960",
+    LeftUpVectorBar: "\u2958",
+    LeftUpVector: "\u21bf",
+    LeftVectorBar: "\u2952",
+    LeftVector: "\u21bc",
+    lEg: "\u2a8b",
+    leg: "\u22da",
+    leq: "\u2264",
+    leqq: "\u2266",
+    leqslant: "\u2a7d",
+    lescc: "\u2aa8",
+    les: "\u2a7d",
+    lesdot: "\u2a7f",
+    lesdoto: "\u2a81",
+    lesdotor: "\u2a83",
+    lesg: "\u22da\ufe00",
+    lesges: "\u2a93",
+    lessapprox: "\u2a85",
+    lessdot: "\u22d6",
+    lesseqgtr: "\u22da",
+    lesseqqgtr: "\u2a8b",
+    LessEqualGreater: "\u22da",
+    LessFullEqual: "\u2266",
+    LessGreater: "\u2276",
+    lessgtr: "\u2276",
+    LessLess: "\u2aa1",
+    lesssim: "\u2272",
+    LessSlantEqual: "\u2a7d",
+    LessTilde: "\u2272",
+    lfisht: "\u297c",
+    lfloor: "\u230a",
+    Lfr: "\ud835\udd0f",
+    lfr: "\ud835\udd29",
+    lg: "\u2276",
+    lgE: "\u2a91",
+    lHar: "\u2962",
+    lhard: "\u21bd",
+    lharu: "\u21bc",
+    lharul: "\u296a",
+    lhblk: "\u2584",
+    LJcy: "\u0409",
+    ljcy: "\u0459",
+    llarr: "\u21c7",
+    ll: "\u226a",
+    Ll: "\u22d8",
+    llcorner: "\u231e",
+    Lleftarrow: "\u21da",
+    llhard: "\u296b",
+    lltri: "\u25fa",
+    Lmidot: "\u013f",
+    lmidot: "\u0140",
+    lmoustache: "\u23b0",
+    lmoust: "\u23b0",
+    lnap: "\u2a89",
+    lnapprox: "\u2a89",
+    lne: "\u2a87",
+    lnE: "\u2268",
+    lneq: "\u2a87",
+    lneqq: "\u2268",
+    lnsim: "\u22e6",
+    loang: "\u27ec",
+    loarr: "\u21fd",
+    lobrk: "\u27e6",
+    longleftarrow: "\u27f5",
+    LongLeftArrow: "\u27f5",
+    Longleftarrow: "\u27f8",
+    longleftrightarrow: "\u27f7",
+    LongLeftRightArrow: "\u27f7",
+    Longleftrightarrow: "\u27fa",
+    longmapsto: "\u27fc",
+    longrightarrow: "\u27f6",
+    LongRightArrow: "\u27f6",
+    Longrightarrow: "\u27f9",
+    looparrowleft: "\u21ab",
+    looparrowright: "\u21ac",
+    lopar: "\u2985",
+    Lopf: "\ud835\udd43",
+    lopf: "\ud835\udd5d",
+    loplus: "\u2a2d",
+    lotimes: "\u2a34",
+    lowast: "\u2217",
+    lowbar: "_",
+    LowerLeftArrow: "\u2199",
+    LowerRightArrow: "\u2198",
+    loz: "\u25ca",
+    lozenge: "\u25ca",
+    lozf: "\u29eb",
+    lpar: "(",
+    lparlt: "\u2993",
+    lrarr: "\u21c6",
+    lrcorner: "\u231f",
+    lrhar: "\u21cb",
+    lrhard: "\u296d",
+    lrm: "\u200e",
+    lrtri: "\u22bf",
+    lsaquo: "\u2039",
+    lscr: "\ud835\udcc1",
+    Lscr: "\u2112",
+    lsh: "\u21b0",
+    Lsh: "\u21b0",
+    lsim: "\u2272",
+    lsime: "\u2a8d",
+    lsimg: "\u2a8f",
+    lsqb: "[",
+    lsquo: "\u2018",
+    lsquor: "\u201a",
+    Lstrok: "\u0141",
+    lstrok: "\u0142",
+    ltcc: "\u2aa6",
+    ltcir: "\u2a79",
+    lt: "<",
+    LT: "<",
+    Lt: "\u226a",
+    ltdot: "\u22d6",
+    lthree: "\u22cb",
+    ltimes: "\u22c9",
+    ltlarr: "\u2976",
+    ltquest: "\u2a7b",
+    ltri: "\u25c3",
+    ltrie: "\u22b4",
+    ltrif: "\u25c2",
+    ltrPar: "\u2996",
+    lurdshar: "\u294a",
+    luruhar: "\u2966",
+    lvertneqq: "\u2268\ufe00",
+    lvnE: "\u2268\ufe00",
+    macr: "\xaf",
+    male: "\u2642",
+    malt: "\u2720",
+    maltese: "\u2720",
+    Map: "\u2905",
+    map: "\u21a6",
+    mapsto: "\u21a6",
+    mapstodown: "\u21a7",
+    mapstoleft: "\u21a4",
+    mapstoup: "\u21a5",
+    marker: "\u25ae",
+    mcomma: "\u2a29",
+    Mcy: "\u041c",
+    mcy: "\u043c",
+    mdash: "\u2014",
+    mDDot: "\u223a",
+    measuredangle: "\u2221",
+    MediumSpace: "\u205f",
+    Mellintrf: "\u2133",
+    Mfr: "\ud835\udd10",
+    mfr: "\ud835\udd2a",
+    mho: "\u2127",
+    micro: "\xb5",
+    midast: "*",
+    midcir: "\u2af0",
+    mid: "\u2223",
+    middot: "\xb7",
+    minusb: "\u229f",
+    minus: "\u2212",
+    minusd: "\u2238",
+    minusdu: "\u2a2a",
+    MinusPlus: "\u2213",
+    mlcp: "\u2adb",
+    mldr: "\u2026",
+    mnplus: "\u2213",
+    models: "\u22a7",
+    Mopf: "\ud835\udd44",
+    mopf: "\ud835\udd5e",
+    mp: "\u2213",
+    mscr: "\ud835\udcc2",
+    Mscr: "\u2133",
+    mstpos: "\u223e",
+    Mu: "\u039c",
+    mu: "\u03bc",
+    multimap: "\u22b8",
+    mumap: "\u22b8",
+    nabla: "\u2207",
+    Nacute: "\u0143",
+    nacute: "\u0144",
+    nang: "\u2220\u20d2",
+    nap: "\u2249",
+    napE: "\u2a70\u0338",
+    napid: "\u224b\u0338",
+    napos: "\u0149",
+    napprox: "\u2249",
+    natural: "\u266e",
+    naturals: "\u2115",
+    natur: "\u266e",
+    nbsp: "\xa0",
+    nbump: "\u224e\u0338",
+    nbumpe: "\u224f\u0338",
+    ncap: "\u2a43",
+    Ncaron: "\u0147",
+    ncaron: "\u0148",
+    Ncedil: "\u0145",
+    ncedil: "\u0146",
+    ncong: "\u2247",
+    ncongdot: "\u2a6d\u0338",
+    ncup: "\u2a42",
+    Ncy: "\u041d",
+    ncy: "\u043d",
+    ndash: "\u2013",
+    nearhk: "\u2924",
+    nearr: "\u2197",
+    neArr: "\u21d7",
+    nearrow: "\u2197",
+    ne: "\u2260",
+    nedot: "\u2250\u0338",
+    NegativeMediumSpace: "\u200b",
+    NegativeThickSpace: "\u200b",
+    NegativeThinSpace: "\u200b",
+    NegativeVeryThinSpace: "\u200b",
+    nequiv: "\u2262",
+    nesear: "\u2928",
+    nesim: "\u2242\u0338",
+    NestedGreaterGreater: "\u226b",
+    NestedLessLess: "\u226a",
+    NewLine: "\n",
+    nexist: "\u2204",
+    nexists: "\u2204",
+    Nfr: "\ud835\udd11",
+    nfr: "\ud835\udd2b",
+    ngE: "\u2267\u0338",
+    nge: "\u2271",
+    ngeq: "\u2271",
+    ngeqq: "\u2267\u0338",
+    ngeqslant: "\u2a7e\u0338",
+    nges: "\u2a7e\u0338",
+    nGg: "\u22d9\u0338",
+    ngsim: "\u2275",
+    nGt: "\u226b\u20d2",
+    ngt: "\u226f",
+    ngtr: "\u226f",
+    nGtv: "\u226b\u0338",
+    nharr: "\u21ae",
+    nhArr: "\u21ce",
+    nhpar: "\u2af2",
+    ni: "\u220b",
+    nis: "\u22fc",
+    nisd: "\u22fa",
+    niv: "\u220b",
+    NJcy: "\u040a",
+    njcy: "\u045a",
+    nlarr: "\u219a",
+    nlArr: "\u21cd",
+    nldr: "\u2025",
+    nlE: "\u2266\u0338",
+    nle: "\u2270",
+    nleftarrow: "\u219a",
+    nLeftarrow: "\u21cd",
+    nleftrightarrow: "\u21ae",
+    nLeftrightarrow: "\u21ce",
+    nleq: "\u2270",
+    nleqq: "\u2266\u0338",
+    nleqslant: "\u2a7d\u0338",
+    nles: "\u2a7d\u0338",
+    nless: "\u226e",
+    nLl: "\u22d8\u0338",
+    nlsim: "\u2274",
+    nLt: "\u226a\u20d2",
+    nlt: "\u226e",
+    nltri: "\u22ea",
+    nltrie: "\u22ec",
+    nLtv: "\u226a\u0338",
+    nmid: "\u2224",
+    NoBreak: "\u2060",
+    NonBreakingSpace: "\xa0",
+    nopf: "\ud835\udd5f",
+    Nopf: "\u2115",
+    Not: "\u2aec",
+    not: "\xac",
+    NotCongruent: "\u2262",
+    NotCupCap: "\u226d",
+    NotDoubleVerticalBar: "\u2226",
+    NotElement: "\u2209",
+    NotEqual: "\u2260",
+    NotEqualTilde: "\u2242\u0338",
+    NotExists: "\u2204",
+    NotGreater: "\u226f",
+    NotGreaterEqual: "\u2271",
+    NotGreaterFullEqual: "\u2267\u0338",
+    NotGreaterGreater: "\u226b\u0338",
+    NotGreaterLess: "\u2279",
+    NotGreaterSlantEqual: "\u2a7e\u0338",
+    NotGreaterTilde: "\u2275",
+    NotHumpDownHump: "\u224e\u0338",
+    NotHumpEqual: "\u224f\u0338",
+    notin: "\u2209",
+    notindot: "\u22f5\u0338",
+    notinE: "\u22f9\u0338",
+    notinva: "\u2209",
+    notinvb: "\u22f7",
+    notinvc: "\u22f6",
+    NotLeftTriangleBar: "\u29cf\u0338",
+    NotLeftTriangle: "\u22ea",
+    NotLeftTriangleEqual: "\u22ec",
+    NotLess: "\u226e",
+    NotLessEqual: "\u2270",
+    NotLessGreater: "\u2278",
+    NotLessLess: "\u226a\u0338",
+    NotLessSlantEqual: "\u2a7d\u0338",
+    NotLessTilde: "\u2274",
+    NotNestedGreaterGreater: "\u2aa2\u0338",
+    NotNestedLessLess: "\u2aa1\u0338",
+    notni: "\u220c",
+    notniva: "\u220c",
+    notnivb: "\u22fe",
+    notnivc: "\u22fd",
+    NotPrecedes: "\u2280",
+    NotPrecedesEqual: "\u2aaf\u0338",
+    NotPrecedesSlantEqual: "\u22e0",
+    NotReverseElement: "\u220c",
+    NotRightTriangleBar: "\u29d0\u0338",
+    NotRightTriangle: "\u22eb",
+    NotRightTriangleEqual: "\u22ed",
+    NotSquareSubset: "\u228f\u0338",
+    NotSquareSubsetEqual: "\u22e2",
+    NotSquareSuperset: "\u2290\u0338",
+    NotSquareSupersetEqual: "\u22e3",
+    NotSubset: "\u2282\u20d2",
+    NotSubsetEqual: "\u2288",
+    NotSucceeds: "\u2281",
+    NotSucceedsEqual: "\u2ab0\u0338",
+    NotSucceedsSlantEqual: "\u22e1",
+    NotSucceedsTilde: "\u227f\u0338",
+    NotSuperset: "\u2283\u20d2",
+    NotSupersetEqual: "\u2289",
+    NotTilde: "\u2241",
+    NotTildeEqual: "\u2244",
+    NotTildeFullEqual: "\u2247",
+    NotTildeTilde: "\u2249",
+    NotVerticalBar: "\u2224",
+    nparallel: "\u2226",
+    npar: "\u2226",
+    nparsl: "\u2afd\u20e5",
+    npart: "\u2202\u0338",
+    npolint: "\u2a14",
+    npr: "\u2280",
+    nprcue: "\u22e0",
+    nprec: "\u2280",
+    npreceq: "\u2aaf\u0338",
+    npre: "\u2aaf\u0338",
+    nrarrc: "\u2933\u0338",
+    nrarr: "\u219b",
+    nrArr: "\u21cf",
+    nrarrw: "\u219d\u0338",
+    nrightarrow: "\u219b",
+    nRightarrow: "\u21cf",
+    nrtri: "\u22eb",
+    nrtrie: "\u22ed",
+    nsc: "\u2281",
+    nsccue: "\u22e1",
+    nsce: "\u2ab0\u0338",
+    Nscr: "\ud835\udca9",
+    nscr: "\ud835\udcc3",
+    nshortmid: "\u2224",
+    nshortparallel: "\u2226",
+    nsim: "\u2241",
+    nsime: "\u2244",
+    nsimeq: "\u2244",
+    nsmid: "\u2224",
+    nspar: "\u2226",
+    nsqsube: "\u22e2",
+    nsqsupe: "\u22e3",
+    nsub: "\u2284",
+    nsubE: "\u2ac5\u0338",
+    nsube: "\u2288",
+    nsubset: "\u2282\u20d2",
+    nsubseteq: "\u2288",
+    nsubseteqq: "\u2ac5\u0338",
+    nsucc: "\u2281",
+    nsucceq: "\u2ab0\u0338",
+    nsup: "\u2285",
+    nsupE: "\u2ac6\u0338",
+    nsupe: "\u2289",
+    nsupset: "\u2283\u20d2",
+    nsupseteq: "\u2289",
+    nsupseteqq: "\u2ac6\u0338",
+    ntgl: "\u2279",
+    Ntilde: "\xd1",
+    ntilde: "\xf1",
+    ntlg: "\u2278",
+    ntriangleleft: "\u22ea",
+    ntrianglelefteq: "\u22ec",
+    ntriangleright: "\u22eb",
+    ntrianglerighteq: "\u22ed",
+    Nu: "\u039d",
+    nu: "\u03bd",
+    num: "#",
+    numero: "\u2116",
+    numsp: "\u2007",
+    nvap: "\u224d\u20d2",
+    nvdash: "\u22ac",
+    nvDash: "\u22ad",
+    nVdash: "\u22ae",
+    nVDash: "\u22af",
+    nvge: "\u2265\u20d2",
+    nvgt: ">\u20d2",
+    nvHarr: "\u2904",
+    nvinfin: "\u29de",
+    nvlArr: "\u2902",
+    nvle: "\u2264\u20d2",
+    nvlt: "<\u20d2",
+    nvltrie: "\u22b4\u20d2",
+    nvrArr: "\u2903",
+    nvrtrie: "\u22b5\u20d2",
+    nvsim: "\u223c\u20d2",
+    nwarhk: "\u2923",
+    nwarr: "\u2196",
+    nwArr: "\u21d6",
+    nwarrow: "\u2196",
+    nwnear: "\u2927",
+    Oacute: "\xd3",
+    oacute: "\xf3",
+    oast: "\u229b",
+    Ocirc: "\xd4",
+    ocirc: "\xf4",
+    ocir: "\u229a",
+    Ocy: "\u041e",
+    ocy: "\u043e",
+    odash: "\u229d",
+    Odblac: "\u0150",
+    odblac: "\u0151",
+    odiv: "\u2a38",
+    odot: "\u2299",
+    odsold: "\u29bc",
+    OElig: "\u0152",
+    oelig: "\u0153",
+    ofcir: "\u29bf",
+    Ofr: "\ud835\udd12",
+    ofr: "\ud835\udd2c",
+    ogon: "\u02db",
+    Ograve: "\xd2",
+    ograve: "\xf2",
+    ogt: "\u29c1",
+    ohbar: "\u29b5",
+    ohm: "\u03a9",
+    oint: "\u222e",
+    olarr: "\u21ba",
+    olcir: "\u29be",
+    olcross: "\u29bb",
+    oline: "\u203e",
+    olt: "\u29c0",
+    Omacr: "\u014c",
+    omacr: "\u014d",
+    Omega: "\u03a9",
+    omega: "\u03c9",
+    Omicron: "\u039f",
+    omicron: "\u03bf",
+    omid: "\u29b6",
+    ominus: "\u2296",
+    Oopf: "\ud835\udd46",
+    oopf: "\ud835\udd60",
+    opar: "\u29b7",
+    OpenCurlyDoubleQuote: "\u201c",
+    OpenCurlyQuote: "\u2018",
+    operp: "\u29b9",
+    oplus: "\u2295",
+    orarr: "\u21bb",
+    Or: "\u2a54",
+    or: "\u2228",
+    ord: "\u2a5d",
+    order: "\u2134",
+    orderof: "\u2134",
+    ordf: "\xaa",
+    ordm: "\xba",
+    origof: "\u22b6",
+    oror: "\u2a56",
+    orslope: "\u2a57",
+    orv: "\u2a5b",
+    oS: "\u24c8",
+    Oscr: "\ud835\udcaa",
+    oscr: "\u2134",
+    Oslash: "\xd8",
+    oslash: "\xf8",
+    osol: "\u2298",
+    Otilde: "\xd5",
+    otilde: "\xf5",
+    otimesas: "\u2a36",
+    Otimes: "\u2a37",
+    otimes: "\u2297",
+    Ouml: "\xd6",
+    ouml: "\xf6",
+    ovbar: "\u233d",
+    OverBar: "\u203e",
+    OverBrace: "\u23de",
+    OverBracket: "\u23b4",
+    OverParenthesis: "\u23dc",
+    para: "\xb6",
+    parallel: "\u2225",
+    par: "\u2225",
+    parsim: "\u2af3",
+    parsl: "\u2afd",
+    part: "\u2202",
+    PartialD: "\u2202",
+    Pcy: "\u041f",
+    pcy: "\u043f",
+    percnt: "%",
+    period: ".",
+    permil: "\u2030",
+    perp: "\u22a5",
+    pertenk: "\u2031",
+    Pfr: "\ud835\udd13",
+    pfr: "\ud835\udd2d",
+    Phi: "\u03a6",
+    phi: "\u03c6",
+    phiv: "\u03d5",
+    phmmat: "\u2133",
+    phone: "\u260e",
+    Pi: "\u03a0",
+    pi: "\u03c0",
+    pitchfork: "\u22d4",
+    piv: "\u03d6",
+    planck: "\u210f",
+    planckh: "\u210e",
+    plankv: "\u210f",
+    plusacir: "\u2a23",
+    plusb: "\u229e",
+    pluscir: "\u2a22",
+    plus: "+",
+    plusdo: "\u2214",
+    plusdu: "\u2a25",
+    pluse: "\u2a72",
+    PlusMinus: "\xb1",
+    plusmn: "\xb1",
+    plussim: "\u2a26",
+    plustwo: "\u2a27",
+    pm: "\xb1",
+    Poincareplane: "\u210c",
+    pointint: "\u2a15",
+    popf: "\ud835\udd61",
+    Popf: "\u2119",
+    pound: "\xa3",
+    prap: "\u2ab7",
+    Pr: "\u2abb",
+    pr: "\u227a",
+    prcue: "\u227c",
+    precapprox: "\u2ab7",
+    prec: "\u227a",
+    preccurlyeq: "\u227c",
+    Precedes: "\u227a",
+    PrecedesEqual: "\u2aaf",
+    PrecedesSlantEqual: "\u227c",
+    PrecedesTilde: "\u227e",
+    preceq: "\u2aaf",
+    precnapprox: "\u2ab9",
+    precneqq: "\u2ab5",
+    precnsim: "\u22e8",
+    pre: "\u2aaf",
+    prE: "\u2ab3",
+    precsim: "\u227e",
+    prime: "\u2032",
+    Prime: "\u2033",
+    primes: "\u2119",
+    prnap: "\u2ab9",
+    prnE: "\u2ab5",
+    prnsim: "\u22e8",
+    prod: "\u220f",
+    Product: "\u220f",
+    profalar: "\u232e",
+    profline: "\u2312",
+    profsurf: "\u2313",
+    prop: "\u221d",
+    Proportional: "\u221d",
+    Proportion: "\u2237",
+    propto: "\u221d",
+    prsim: "\u227e",
+    prurel: "\u22b0",
+    Pscr: "\ud835\udcab",
+    pscr: "\ud835\udcc5",
+    Psi: "\u03a8",
+    psi: "\u03c8",
+    puncsp: "\u2008",
+    Qfr: "\ud835\udd14",
+    qfr: "\ud835\udd2e",
+    qint: "\u2a0c",
+    qopf: "\ud835\udd62",
+    Qopf: "\u211a",
+    qprime: "\u2057",
+    Qscr: "\ud835\udcac",
+    qscr: "\ud835\udcc6",
+    quaternions: "\u210d",
+    quatint: "\u2a16",
+    quest: "?",
+    questeq: "\u225f",
+    quot: '"',
+    QUOT: '"',
+    rAarr: "\u21db",
+    race: "\u223d\u0331",
+    Racute: "\u0154",
+    racute: "\u0155",
+    radic: "\u221a",
+    raemptyv: "\u29b3",
+    rang: "\u27e9",
+    Rang: "\u27eb",
+    rangd: "\u2992",
+    range: "\u29a5",
+    rangle: "\u27e9",
+    raquo: "\xbb",
+    rarrap: "\u2975",
+    rarrb: "\u21e5",
+    rarrbfs: "\u2920",
+    rarrc: "\u2933",
+    rarr: "\u2192",
+    Rarr: "\u21a0",
+    rArr: "\u21d2",
+    rarrfs: "\u291e",
+    rarrhk: "\u21aa",
+    rarrlp: "\u21ac",
+    rarrpl: "\u2945",
+    rarrsim: "\u2974",
+    Rarrtl: "\u2916",
+    rarrtl: "\u21a3",
+    rarrw: "\u219d",
+    ratail: "\u291a",
+    rAtail: "\u291c",
+    ratio: "\u2236",
+    rationals: "\u211a",
+    rbarr: "\u290d",
+    rBarr: "\u290f",
+    RBarr: "\u2910",
+    rbbrk: "\u2773",
+    rbrace: "}",
+    rbrack: "]",
+    rbrke: "\u298c",
+    rbrksld: "\u298e",
+    rbrkslu: "\u2990",
+    Rcaron: "\u0158",
+    rcaron: "\u0159",
+    Rcedil: "\u0156",
+    rcedil: "\u0157",
+    rceil: "\u2309",
+    rcub: "}",
+    Rcy: "\u0420",
+    rcy: "\u0440",
+    rdca: "\u2937",
+    rdldhar: "\u2969",
+    rdquo: "\u201d",
+    rdquor: "\u201d",
+    rdsh: "\u21b3",
+    real: "\u211c",
+    realine: "\u211b",
+    realpart: "\u211c",
+    reals: "\u211d",
+    Re: "\u211c",
+    rect: "\u25ad",
+    reg: "\xae",
+    REG: "\xae",
+    ReverseElement: "\u220b",
+    ReverseEquilibrium: "\u21cb",
+    ReverseUpEquilibrium: "\u296f",
+    rfisht: "\u297d",
+    rfloor: "\u230b",
+    rfr: "\ud835\udd2f",
+    Rfr: "\u211c",
+    rHar: "\u2964",
+    rhard: "\u21c1",
+    rharu: "\u21c0",
+    rharul: "\u296c",
+    Rho: "\u03a1",
+    rho: "\u03c1",
+    rhov: "\u03f1",
+    RightAngleBracket: "\u27e9",
+    RightArrowBar: "\u21e5",
+    rightarrow: "\u2192",
+    RightArrow: "\u2192",
+    Rightarrow: "\u21d2",
+    RightArrowLeftArrow: "\u21c4",
+    rightarrowtail: "\u21a3",
+    RightCeiling: "\u2309",
+    RightDoubleBracket: "\u27e7",
+    RightDownTeeVector: "\u295d",
+    RightDownVectorBar: "\u2955",
+    RightDownVector: "\u21c2",
+    RightFloor: "\u230b",
+    rightharpoondown: "\u21c1",
+    rightharpoonup: "\u21c0",
+    rightleftarrows: "\u21c4",
+    rightleftharpoons: "\u21cc",
+    rightrightarrows: "\u21c9",
+    rightsquigarrow: "\u219d",
+    RightTeeArrow: "\u21a6",
+    RightTee: "\u22a2",
+    RightTeeVector: "\u295b",
+    rightthreetimes: "\u22cc",
+    RightTriangleBar: "\u29d0",
+    RightTriangle: "\u22b3",
+    RightTriangleEqual: "\u22b5",
+    RightUpDownVector: "\u294f",
+    RightUpTeeVector: "\u295c",
+    RightUpVectorBar: "\u2954",
+    RightUpVector: "\u21be",
+    RightVectorBar: "\u2953",
+    RightVector: "\u21c0",
+    ring: "\u02da",
+    risingdotseq: "\u2253",
+    rlarr: "\u21c4",
+    rlhar: "\u21cc",
+    rlm: "\u200f",
+    rmoustache: "\u23b1",
+    rmoust: "\u23b1",
+    rnmid: "\u2aee",
+    roang: "\u27ed",
+    roarr: "\u21fe",
+    robrk: "\u27e7",
+    ropar: "\u2986",
+    ropf: "\ud835\udd63",
+    Ropf: "\u211d",
+    roplus: "\u2a2e",
+    rotimes: "\u2a35",
+    RoundImplies: "\u2970",
+    rpar: ")",
+    rpargt: "\u2994",
+    rppolint: "\u2a12",
+    rrarr: "\u21c9",
+    Rrightarrow: "\u21db",
+    rsaquo: "\u203a",
+    rscr: "\ud835\udcc7",
+    Rscr: "\u211b",
+    rsh: "\u21b1",
+    Rsh: "\u21b1",
+    rsqb: "]",
+    rsquo: "\u2019",
+    rsquor: "\u2019",
+    rthree: "\u22cc",
+    rtimes: "\u22ca",
+    rtri: "\u25b9",
+    rtrie: "\u22b5",
+    rtrif: "\u25b8",
+    rtriltri: "\u29ce",
+    RuleDelayed: "\u29f4",
+    ruluhar: "\u2968",
+    rx: "\u211e",
+    Sacute: "\u015a",
+    sacute: "\u015b",
+    sbquo: "\u201a",
+    scap: "\u2ab8",
+    Scaron: "\u0160",
+    scaron: "\u0161",
+    Sc: "\u2abc",
+    sc: "\u227b",
+    sccue: "\u227d",
+    sce: "\u2ab0",
+    scE: "\u2ab4",
+    Scedil: "\u015e",
+    scedil: "\u015f",
+    Scirc: "\u015c",
+    scirc: "\u015d",
+    scnap: "\u2aba",
+    scnE: "\u2ab6",
+    scnsim: "\u22e9",
+    scpolint: "\u2a13",
+    scsim: "\u227f",
+    Scy: "\u0421",
+    scy: "\u0441",
+    sdotb: "\u22a1",
+    sdot: "\u22c5",
+    sdote: "\u2a66",
+    searhk: "\u2925",
+    searr: "\u2198",
+    seArr: "\u21d8",
+    searrow: "\u2198",
+    sect: "\xa7",
+    semi: ";",
+    seswar: "\u2929",
+    setminus: "\u2216",
+    setmn: "\u2216",
+    sext: "\u2736",
+    Sfr: "\ud835\udd16",
+    sfr: "\ud835\udd30",
+    sfrown: "\u2322",
+    sharp: "\u266f",
+    SHCHcy: "\u0429",
+    shchcy: "\u0449",
+    SHcy: "\u0428",
+    shcy: "\u0448",
+    ShortDownArrow: "\u2193",
+    ShortLeftArrow: "\u2190",
+    shortmid: "\u2223",
+    shortparallel: "\u2225",
+    ShortRightArrow: "\u2192",
+    ShortUpArrow: "\u2191",
+    shy: "\xad",
+    Sigma: "\u03a3",
+    sigma: "\u03c3",
+    sigmaf: "\u03c2",
+    sigmav: "\u03c2",
+    sim: "\u223c",
+    simdot: "\u2a6a",
+    sime: "\u2243",
+    simeq: "\u2243",
+    simg: "\u2a9e",
+    simgE: "\u2aa0",
+    siml: "\u2a9d",
+    simlE: "\u2a9f",
+    simne: "\u2246",
+    simplus: "\u2a24",
+    simrarr: "\u2972",
+    slarr: "\u2190",
+    SmallCircle: "\u2218",
+    smallsetminus: "\u2216",
+    smashp: "\u2a33",
+    smeparsl: "\u29e4",
+    smid: "\u2223",
+    smile: "\u2323",
+    smt: "\u2aaa",
+    smte: "\u2aac",
+    smtes: "\u2aac\ufe00",
+    SOFTcy: "\u042c",
+    softcy: "\u044c",
+    solbar: "\u233f",
+    solb: "\u29c4",
+    sol: "/",
+    Sopf: "\ud835\udd4a",
+    sopf: "\ud835\udd64",
+    spades: "\u2660",
+    spadesuit: "\u2660",
+    spar: "\u2225",
+    sqcap: "\u2293",
+    sqcaps: "\u2293\ufe00",
+    sqcup: "\u2294",
+    sqcups: "\u2294\ufe00",
+    Sqrt: "\u221a",
+    sqsub: "\u228f",
+    sqsube: "\u2291",
+    sqsubset: "\u228f",
+    sqsubseteq: "\u2291",
+    sqsup: "\u2290",
+    sqsupe: "\u2292",
+    sqsupset: "\u2290",
+    sqsupseteq: "\u2292",
+    square: "\u25a1",
+    Square: "\u25a1",
+    SquareIntersection: "\u2293",
+    SquareSubset: "\u228f",
+    SquareSubsetEqual: "\u2291",
+    SquareSuperset: "\u2290",
+    SquareSupersetEqual: "\u2292",
+    SquareUnion: "\u2294",
+    squarf: "\u25aa",
+    squ: "\u25a1",
+    squf: "\u25aa",
+    srarr: "\u2192",
+    Sscr: "\ud835\udcae",
+    sscr: "\ud835\udcc8",
+    ssetmn: "\u2216",
+    ssmile: "\u2323",
+    sstarf: "\u22c6",
+    Star: "\u22c6",
+    star: "\u2606",
+    starf: "\u2605",
+    straightepsilon: "\u03f5",
+    straightphi: "\u03d5",
+    strns: "\xaf",
+    sub: "\u2282",
+    Sub: "\u22d0",
+    subdot: "\u2abd",
+    subE: "\u2ac5",
+    sube: "\u2286",
+    subedot: "\u2ac3",
+    submult: "\u2ac1",
+    subnE: "\u2acb",
+    subne: "\u228a",
+    subplus: "\u2abf",
+    subrarr: "\u2979",
+    subset: "\u2282",
+    Subset: "\u22d0",
+    subseteq: "\u2286",
+    subseteqq: "\u2ac5",
+    SubsetEqual: "\u2286",
+    subsetneq: "\u228a",
+    subsetneqq: "\u2acb",
+    subsim: "\u2ac7",
+    subsub: "\u2ad5",
+    subsup: "\u2ad3",
+    succapprox: "\u2ab8",
+    succ: "\u227b",
+    succcurlyeq: "\u227d",
+    Succeeds: "\u227b",
+    SucceedsEqual: "\u2ab0",
+    SucceedsSlantEqual: "\u227d",
+    SucceedsTilde: "\u227f",
+    succeq: "\u2ab0",
+    succnapprox: "\u2aba",
+    succneqq: "\u2ab6",
+    succnsim: "\u22e9",
+    succsim: "\u227f",
+    SuchThat: "\u220b",
+    sum: "\u2211",
+    Sum: "\u2211",
+    sung: "\u266a",
+    sup1: "\xb9",
+    sup2: "\xb2",
+    sup3: "\xb3",
+    sup: "\u2283",
+    Sup: "\u22d1",
+    supdot: "\u2abe",
+    supdsub: "\u2ad8",
+    supE: "\u2ac6",
+    supe: "\u2287",
+    supedot: "\u2ac4",
+    Superset: "\u2283",
+    SupersetEqual: "\u2287",
+    suphsol: "\u27c9",
+    suphsub: "\u2ad7",
+    suplarr: "\u297b",
+    supmult: "\u2ac2",
+    supnE: "\u2acc",
+    supne: "\u228b",
+    supplus: "\u2ac0",
+    supset: "\u2283",
+    Supset: "\u22d1",
+    supseteq: "\u2287",
+    supseteqq: "\u2ac6",
+    supsetneq: "\u228b",
+    supsetneqq: "\u2acc",
+    supsim: "\u2ac8",
+    supsub: "\u2ad4",
+    supsup: "\u2ad6",
+    swarhk: "\u2926",
+    swarr: "\u2199",
+    swArr: "\u21d9",
+    swarrow: "\u2199",
+    swnwar: "\u292a",
+    szlig: "\xdf",
+    Tab: "\t",
+    target: "\u2316",
+    Tau: "\u03a4",
+    tau: "\u03c4",
+    tbrk: "\u23b4",
+    Tcaron: "\u0164",
+    tcaron: "\u0165",
+    Tcedil: "\u0162",
+    tcedil: "\u0163",
+    Tcy: "\u0422",
+    tcy: "\u0442",
+    tdot: "\u20db",
+    telrec: "\u2315",
+    Tfr: "\ud835\udd17",
+    tfr: "\ud835\udd31",
+    there4: "\u2234",
+    therefore: "\u2234",
+    Therefore: "\u2234",
+    Theta: "\u0398",
+    theta: "\u03b8",
+    thetasym: "\u03d1",
+    thetav: "\u03d1",
+    thickapprox: "\u2248",
+    thicksim: "\u223c",
+    ThickSpace: "\u205f\u200a",
+    ThinSpace: "\u2009",
+    thinsp: "\u2009",
+    thkap: "\u2248",
+    thksim: "\u223c",
+    THORN: "\xde",
+    thorn: "\xfe",
+    tilde: "\u02dc",
+    Tilde: "\u223c",
+    TildeEqual: "\u2243",
+    TildeFullEqual: "\u2245",
+    TildeTilde: "\u2248",
+    timesbar: "\u2a31",
+    timesb: "\u22a0",
+    times: "\xd7",
+    timesd: "\u2a30",
+    tint: "\u222d",
+    toea: "\u2928",
+    topbot: "\u2336",
+    topcir: "\u2af1",
+    top: "\u22a4",
+    Topf: "\ud835\udd4b",
+    topf: "\ud835\udd65",
+    topfork: "\u2ada",
+    tosa: "\u2929",
+    tprime: "\u2034",
+    trade: "\u2122",
+    TRADE: "\u2122",
+    triangle: "\u25b5",
+    triangledown: "\u25bf",
+    triangleleft: "\u25c3",
+    trianglelefteq: "\u22b4",
+    triangleq: "\u225c",
+    triangleright: "\u25b9",
+    trianglerighteq: "\u22b5",
+    tridot: "\u25ec",
+    trie: "\u225c",
+    triminus: "\u2a3a",
+    TripleDot: "\u20db",
+    triplus: "\u2a39",
+    trisb: "\u29cd",
+    tritime: "\u2a3b",
+    trpezium: "\u23e2",
+    Tscr: "\ud835\udcaf",
+    tscr: "\ud835\udcc9",
+    TScy: "\u0426",
+    tscy: "\u0446",
+    TSHcy: "\u040b",
+    tshcy: "\u045b",
+    Tstrok: "\u0166",
+    tstrok: "\u0167",
+    twixt: "\u226c",
+    twoheadleftarrow: "\u219e",
+    twoheadrightarrow: "\u21a0",
+    Uacute: "\xda",
+    uacute: "\xfa",
+    uarr: "\u2191",
+    Uarr: "\u219f",
+    uArr: "\u21d1",
+    Uarrocir: "\u2949",
+    Ubrcy: "\u040e",
+    ubrcy: "\u045e",
+    Ubreve: "\u016c",
+    ubreve: "\u016d",
+    Ucirc: "\xdb",
+    ucirc: "\xfb",
+    Ucy: "\u0423",
+    ucy: "\u0443",
+    udarr: "\u21c5",
+    Udblac: "\u0170",
+    udblac: "\u0171",
+    udhar: "\u296e",
+    ufisht: "\u297e",
+    Ufr: "\ud835\udd18",
+    ufr: "\ud835\udd32",
+    Ugrave: "\xd9",
+    ugrave: "\xf9",
+    uHar: "\u2963",
+    uharl: "\u21bf",
+    uharr: "\u21be",
+    uhblk: "\u2580",
+    ulcorn: "\u231c",
+    ulcorner: "\u231c",
+    ulcrop: "\u230f",
+    ultri: "\u25f8",
+    Umacr: "\u016a",
+    umacr: "\u016b",
+    uml: "\xa8",
+    UnderBar: "_",
+    UnderBrace: "\u23df",
+    UnderBracket: "\u23b5",
+    UnderParenthesis: "\u23dd",
+    Union: "\u22c3",
+    UnionPlus: "\u228e",
+    Uogon: "\u0172",
+    uogon: "\u0173",
+    Uopf: "\ud835\udd4c",
+    uopf: "\ud835\udd66",
+    UpArrowBar: "\u2912",
+    uparrow: "\u2191",
+    UpArrow: "\u2191",
+    Uparrow: "\u21d1",
+    UpArrowDownArrow: "\u21c5",
+    updownarrow: "\u2195",
+    UpDownArrow: "\u2195",
+    Updownarrow: "\u21d5",
+    UpEquilibrium: "\u296e",
+    upharpoonleft: "\u21bf",
+    upharpoonright: "\u21be",
+    uplus: "\u228e",
+    UpperLeftArrow: "\u2196",
+    UpperRightArrow: "\u2197",
+    upsi: "\u03c5",
+    Upsi: "\u03d2",
+    upsih: "\u03d2",
+    Upsilon: "\u03a5",
+    upsilon: "\u03c5",
+    UpTeeArrow: "\u21a5",
+    UpTee: "\u22a5",
+    upuparrows: "\u21c8",
+    urcorn: "\u231d",
+    urcorner: "\u231d",
+    urcrop: "\u230e",
+    Uring: "\u016e",
+    uring: "\u016f",
+    urtri: "\u25f9",
+    Uscr: "\ud835\udcb0",
+    uscr: "\ud835\udcca",
+    utdot: "\u22f0",
+    Utilde: "\u0168",
+    utilde: "\u0169",
+    utri: "\u25b5",
+    utrif: "\u25b4",
+    uuarr: "\u21c8",
+    Uuml: "\xdc",
+    uuml: "\xfc",
+    uwangle: "\u29a7",
+    vangrt: "\u299c",
+    varepsilon: "\u03f5",
+    varkappa: "\u03f0",
+    varnothing: "\u2205",
+    varphi: "\u03d5",
+    varpi: "\u03d6",
+    varpropto: "\u221d",
+    varr: "\u2195",
+    vArr: "\u21d5",
+    varrho: "\u03f1",
+    varsigma: "\u03c2",
+    varsubsetneq: "\u228a\ufe00",
+    varsubsetneqq: "\u2acb\ufe00",
+    varsupsetneq: "\u228b\ufe00",
+    varsupsetneqq: "\u2acc\ufe00",
+    vartheta: "\u03d1",
+    vartriangleleft: "\u22b2",
+    vartriangleright: "\u22b3",
+    vBar: "\u2ae8",
+    Vbar: "\u2aeb",
+    vBarv: "\u2ae9",
+    Vcy: "\u0412",
+    vcy: "\u0432",
+    vdash: "\u22a2",
+    vDash: "\u22a8",
+    Vdash: "\u22a9",
+    VDash: "\u22ab",
+    Vdashl: "\u2ae6",
+    veebar: "\u22bb",
+    vee: "\u2228",
+    Vee: "\u22c1",
+    veeeq: "\u225a",
+    vellip: "\u22ee",
+    verbar: "|",
+    Verbar: "\u2016",
+    vert: "|",
+    Vert: "\u2016",
+    VerticalBar: "\u2223",
+    VerticalLine: "|",
+    VerticalSeparator: "\u2758",
+    VerticalTilde: "\u2240",
+    VeryThinSpace: "\u200a",
+    Vfr: "\ud835\udd19",
+    vfr: "\ud835\udd33",
+    vltri: "\u22b2",
+    vnsub: "\u2282\u20d2",
+    vnsup: "\u2283\u20d2",
+    Vopf: "\ud835\udd4d",
+    vopf: "\ud835\udd67",
+    vprop: "\u221d",
+    vrtri: "\u22b3",
+    Vscr: "\ud835\udcb1",
+    vscr: "\ud835\udccb",
+    vsubnE: "\u2acb\ufe00",
+    vsubne: "\u228a\ufe00",
+    vsupnE: "\u2acc\ufe00",
+    vsupne: "\u228b\ufe00",
+    Vvdash: "\u22aa",
+    vzigzag: "\u299a",
+    Wcirc: "\u0174",
+    wcirc: "\u0175",
+    wedbar: "\u2a5f",
+    wedge: "\u2227",
+    Wedge: "\u22c0",
+    wedgeq: "\u2259",
+    weierp: "\u2118",
+    Wfr: "\ud835\udd1a",
+    wfr: "\ud835\udd34",
+    Wopf: "\ud835\udd4e",
+    wopf: "\ud835\udd68",
+    wp: "\u2118",
+    wr: "\u2240",
+    wreath: "\u2240",
+    Wscr: "\ud835\udcb2",
+    wscr: "\ud835\udccc",
+    xcap: "\u22c2",
+    xcirc: "\u25ef",
+    xcup: "\u22c3",
+    xdtri: "\u25bd",
+    Xfr: "\ud835\udd1b",
+    xfr: "\ud835\udd35",
+    xharr: "\u27f7",
+    xhArr: "\u27fa",
+    Xi: "\u039e",
+    xi: "\u03be",
+    xlarr: "\u27f5",
+    xlArr: "\u27f8",
+    xmap: "\u27fc",
+    xnis: "\u22fb",
+    xodot: "\u2a00",
+    Xopf: "\ud835\udd4f",
+    xopf: "\ud835\udd69",
+    xoplus: "\u2a01",
+    xotime: "\u2a02",
+    xrarr: "\u27f6",
+    xrArr: "\u27f9",
+    Xscr: "\ud835\udcb3",
+    xscr: "\ud835\udccd",
+    xsqcup: "\u2a06",
+    xuplus: "\u2a04",
+    xutri: "\u25b3",
+    xvee: "\u22c1",
+    xwedge: "\u22c0",
+    Yacute: "\xdd",
+    yacute: "\xfd",
+    YAcy: "\u042f",
+    yacy: "\u044f",
+    Ycirc: "\u0176",
+    ycirc: "\u0177",
+    Ycy: "\u042b",
+    ycy: "\u044b",
+    yen: "\xa5",
+    Yfr: "\ud835\udd1c",
+    yfr: "\ud835\udd36",
+    YIcy: "\u0407",
+    yicy: "\u0457",
+    Yopf: "\ud835\udd50",
+    yopf: "\ud835\udd6a",
+    Yscr: "\ud835\udcb4",
+    yscr: "\ud835\udcce",
+    YUcy: "\u042e",
+    yucy: "\u044e",
+    yuml: "\xff",
+    Yuml: "\u0178",
+    Zacute: "\u0179",
+    zacute: "\u017a",
+    Zcaron: "\u017d",
+    zcaron: "\u017e",
+    Zcy: "\u0417",
+    zcy: "\u0437",
+    Zdot: "\u017b",
+    zdot: "\u017c",
+    zeetrf: "\u2128",
+    ZeroWidthSpace: "\u200b",
+    Zeta: "\u0396",
+    zeta: "\u03b6",
+    zfr: "\ud835\udd37",
+    Zfr: "\u2128",
+    ZHcy: "\u0416",
+    zhcy: "\u0436",
+    zigrarr: "\u21dd",
+    zopf: "\ud835\udd6b",
+    Zopf: "\u2124",
+    Zscr: "\ud835\udcb5",
+    zscr: "\ud835\udccf",
+    zwj: "\u200d",
+    zwnj: "\u200c"
+  };
+  /*eslint quotes:0*/  var entities = require$$0;
+  var regex$4 = /[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]/;
+  var encodeCache = {};
+  // Create a lookup array where anything but characters in `chars` string
+  // and alphanumeric chars is percent-encoded.
+  
+    function getEncodeCache(exclude) {
+    var i, ch, cache = encodeCache[exclude];
+    if (cache) {
+      return cache;
+    }
+    cache = encodeCache[exclude] = [];
+    for (i = 0; i < 128; i++) {
+      ch = String.fromCharCode(i);
+      if (/^[0-9a-z]$/i.test(ch)) {
+        // always allow unencoded alphanumeric characters
+        cache.push(ch);
+      } else {
+        cache.push("%" + ("0" + i.toString(16).toUpperCase()).slice(-2));
+      }
+    }
+    for (i = 0; i < exclude.length; i++) {
+      cache[exclude.charCodeAt(i)] = exclude[i];
+    }
+    return cache;
+  }
+  // Encode unsafe characters with percent-encoding, skipping already
+  // encoded sequences.
+  
+  //  - string       - string to encode
+  //  - exclude      - list of characters to ignore (in addition to a-zA-Z0-9)
+  //  - keepEscaped  - don't encode '%' in a correct escape sequence (default: true)
+  
+    function encode$2(string, exclude, keepEscaped) {
+    var i, l, code, nextCode, cache, result = "";
+    if (typeof exclude !== "string") {
+      // encode(string, keepEscaped)
+      keepEscaped = exclude;
+      exclude = encode$2.defaultChars;
+    }
+    if (typeof keepEscaped === "undefined") {
+      keepEscaped = true;
+    }
+    cache = getEncodeCache(exclude);
+    for (i = 0, l = string.length; i < l; i++) {
+      code = string.charCodeAt(i);
+      if (keepEscaped && code === 37 /* % */ && i + 2 < l) {
+        if (/^[0-9a-f]{2}$/i.test(string.slice(i + 1, i + 3))) {
+          result += string.slice(i, i + 3);
+          i += 2;
+          continue;
+        }
+      }
+      if (code < 128) {
+        result += cache[code];
+        continue;
+      }
+      if (code >= 55296 && code <= 57343) {
+        if (code >= 55296 && code <= 56319 && i + 1 < l) {
+          nextCode = string.charCodeAt(i + 1);
+          if (nextCode >= 56320 && nextCode <= 57343) {
+            result += encodeURIComponent(string[i] + string[i + 1]);
+            i++;
+            continue;
+          }
+        }
+        result += "%EF%BF%BD";
+        continue;
+      }
+      result += encodeURIComponent(string[i]);
+    }
+    return result;
+  }
+  encode$2.defaultChars = ";/?:@&=+$,-_.!~*'()#";
+  encode$2.componentChars = "-_.!~*'()";
+  var encode_1 = encode$2;
+  /* eslint-disable no-bitwise */  var decodeCache = {};
+  function getDecodeCache(exclude) {
+    var i, ch, cache = decodeCache[exclude];
+    if (cache) {
+      return cache;
+    }
+    cache = decodeCache[exclude] = [];
+    for (i = 0; i < 128; i++) {
+      ch = String.fromCharCode(i);
+      cache.push(ch);
+    }
+    for (i = 0; i < exclude.length; i++) {
+      ch = exclude.charCodeAt(i);
+      cache[ch] = "%" + ("0" + ch.toString(16).toUpperCase()).slice(-2);
+    }
+    return cache;
+  }
+  // Decode percent-encoded string.
+  
+    function decode$2(string, exclude) {
+    var cache;
+    if (typeof exclude !== "string") {
+      exclude = decode$2.defaultChars;
+    }
+    cache = getDecodeCache(exclude);
+    return string.replace(/(%[a-f0-9]{2})+/gi, (function(seq) {
+      var i, l, b1, b2, b3, b4, chr, result = "";
+      for (i = 0, l = seq.length; i < l; i += 3) {
+        b1 = parseInt(seq.slice(i + 1, i + 3), 16);
+        if (b1 < 128) {
+          result += cache[b1];
+          continue;
+        }
+        if ((b1 & 224) === 192 && i + 3 < l) {
+          // 110xxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          if ((b2 & 192) === 128) {
+            chr = b1 << 6 & 1984 | b2 & 63;
+            if (chr < 128) {
+              result += "\ufffd\ufffd";
+            } else {
+              result += String.fromCharCode(chr);
+            }
+            i += 3;
+            continue;
+          }
+        }
+        if ((b1 & 240) === 224 && i + 6 < l) {
+          // 1110xxxx 10xxxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
+          if ((b2 & 192) === 128 && (b3 & 192) === 128) {
+            chr = b1 << 12 & 61440 | b2 << 6 & 4032 | b3 & 63;
+            if (chr < 2048 || chr >= 55296 && chr <= 57343) {
+              result += "\ufffd\ufffd\ufffd";
+            } else {
+              result += String.fromCharCode(chr);
+            }
+            i += 6;
+            continue;
+          }
+        }
+        if ((b1 & 248) === 240 && i + 9 < l) {
+          // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
+          b2 = parseInt(seq.slice(i + 4, i + 6), 16);
+          b3 = parseInt(seq.slice(i + 7, i + 9), 16);
+          b4 = parseInt(seq.slice(i + 10, i + 12), 16);
+          if ((b2 & 192) === 128 && (b3 & 192) === 128 && (b4 & 192) === 128) {
+            chr = b1 << 18 & 1835008 | b2 << 12 & 258048 | b3 << 6 & 4032 | b4 & 63;
+            if (chr < 65536 || chr > 1114111) {
+              result += "\ufffd\ufffd\ufffd\ufffd";
+            } else {
+              chr -= 65536;
+              result += String.fromCharCode(55296 + (chr >> 10), 56320 + (chr & 1023));
+            }
+            i += 9;
+            continue;
+          }
+        }
+        result += "\ufffd";
+      }
+      return result;
+    }));
+  }
+  decode$2.defaultChars = ";/?:@&=+$,#";
+  decode$2.componentChars = "";
+  var decode_1 = decode$2;
+  var format$1 = function format(url) {
+    var result = "";
+    result += url.protocol || "";
+    result += url.slashes ? "//" : "";
+    result += url.auth ? url.auth + "@" : "";
+    if (url.hostname && url.hostname.indexOf(":") !== -1) {
+      // ipv6 address
+      result += "[" + url.hostname + "]";
+    } else {
+      result += url.hostname || "";
+    }
+    result += url.port ? ":" + url.port : "";
+    result += url.pathname || "";
+    result += url.search || "";
+    result += url.hash || "";
+    return result;
+  };
+  // Copyright Joyent, Inc. and other Node contributors.
+  
+  // Changes from joyent/node:
+  
+  // 1. No leading slash in paths,
+  //    e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
+  
+  // 2. Backslashes are not replaced with slashes,
+  //    so `http:\\example.org\` is treated like a relative path
+  
+  // 3. Trailing colon is treated like a part of the path,
+  //    i.e. in `http://example.org:foo` pathname is `:foo`
+  
+  // 4. Nothing is URL-encoded in the resulting object,
+  //    (in joyent/node some chars in auth and paths are encoded)
+  
+  // 5. `url.parse()` does not have `parseQueryString` argument
+  
+  // 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
+  //    which can be constructed using other parts of the url.
+  
+    function Url() {
+    this.protocol = null;
+    this.slashes = null;
+    this.auth = null;
+    this.port = null;
+    this.hostname = null;
+    this.hash = null;
+    this.search = null;
+    this.pathname = null;
+  }
+  // Reference: RFC 3986, RFC 1808, RFC 2396
+  // define these here so at least they only have to be
+  // compiled once on the first module load.
+    var protocolPattern = /^([a-z0-9.+-]+:)/i, portPattern = /:[0-9]*$/, 
+  // Special case for a simple path URL
+  simplePathPattern = /^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/, 
+  // RFC 2396: characters reserved for delimiting URLs.
+  // We actually just auto-escape these.
+  delims = [ "<", ">", '"', "`", " ", "\r", "\n", "\t" ], 
+  // RFC 2396: characters not allowed for various reasons.
+  unwise = [ "{", "}", "|", "\\", "^", "`" ].concat(delims), 
+  // Allowed by RFCs, but cause of XSS attacks.  Always escape these.
+  autoEscape = [ "'" ].concat(unwise), 
+  // Characters that are never ever allowed in a hostname.
+  // Note that any invalid chars are also handled, but these
+  // are the ones that are *expected* to be seen, so we fast-path
+  // them.
+  nonHostChars = [ "%", "/", "?", ";", "#" ].concat(autoEscape), hostEndingChars = [ "/", "?", "#" ], hostnameMaxLen = 255, hostnamePartPattern = /^[+a-z0-9A-Z_-]{0,63}$/, hostnamePartStart = /^([+a-z0-9A-Z_-]{0,63})(.*)$/, 
+  // protocols that can allow "unsafe" and "unwise" chars.
+  /* eslint-disable no-script-url */
+  // protocols that never have a hostname.
+  hostlessProtocol = {
+    javascript: true,
+    "javascript:": true
+  }, 
+  // protocols that always contain a // bit.
+  slashedProtocol = {
+    http: true,
+    https: true,
+    ftp: true,
+    gopher: true,
+    file: true,
+    "http:": true,
+    "https:": true,
+    "ftp:": true,
+    "gopher:": true,
+    "file:": true
+  };
+  /* eslint-enable no-script-url */  function urlParse(url, slashesDenoteHost) {
+    if (url && url instanceof Url) {
+      return url;
+    }
+    var u = new Url;
+    u.parse(url, slashesDenoteHost);
+    return u;
+  }
+  Url.prototype.parse = function(url, slashesDenoteHost) {
+    var i, l, lowerProto, hec, slashes, rest = url;
+    // trim before proceeding.
+    // This is to support parse stuff like "  http://foo.com  \n"
+        rest = rest.trim();
+    if (!slashesDenoteHost && url.split("#").length === 1) {
+      // Try fast path regexp
+      var simplePath = simplePathPattern.exec(rest);
+      if (simplePath) {
+        this.pathname = simplePath[1];
+        if (simplePath[2]) {
+          this.search = simplePath[2];
+        }
+        return this;
+      }
+    }
+    var proto = protocolPattern.exec(rest);
+    if (proto) {
+      proto = proto[0];
+      lowerProto = proto.toLowerCase();
+      this.protocol = proto;
+      rest = rest.substr(proto.length);
+    }
+    // figure out if it's got a host
+    // user@server is *always* interpreted as a hostname, and url
+    // resolution will treat //foo/bar as host=foo,path=bar because that's
+    // how the browser resolves relative URLs.
+        if (slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)) {
+      slashes = rest.substr(0, 2) === "//";
+      if (slashes && !(proto && hostlessProtocol[proto])) {
+        rest = rest.substr(2);
+        this.slashes = true;
+      }
+    }
+    if (!hostlessProtocol[proto] && (slashes || proto && !slashedProtocol[proto])) {
+      // there's a hostname.
+      // the first instance of /, ?, ;, or # ends the host.
+      // If there is an @ in the hostname, then non-host chars *are* allowed
+      // to the left of the last @ sign, unless some host-ending character
+      // comes *before* the @-sign.
+      // URLs are obnoxious.
+      // ex:
+      // http://a@b@c/ => user:a@b host:c
+      // http://a@b?@c => user:a host:c path:/?@c
+      // v0.12 TODO(isaacs): This is not quite how Chrome does things.
+      // Review our test case against browsers more comprehensively.
+      // find the first instance of any hostEndingChars
+      var hostEnd = -1;
+      for (i = 0; i < hostEndingChars.length; i++) {
+        hec = rest.indexOf(hostEndingChars[i]);
+        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
+          hostEnd = hec;
+        }
+      }
+      // at this point, either we have an explicit point where the
+      // auth portion cannot go past, or the last @ char is the decider.
+            var auth, atSign;
+      if (hostEnd === -1) {
+        // atSign can be anywhere.
+        atSign = rest.lastIndexOf("@");
+      } else {
+        // atSign must be in auth portion.
+        // http://a@b/c@d => host:b auth:a path:/c@d
+        atSign = rest.lastIndexOf("@", hostEnd);
+      }
+      // Now we have a portion which is definitely the auth.
+      // Pull that off.
+            if (atSign !== -1) {
+        auth = rest.slice(0, atSign);
+        rest = rest.slice(atSign + 1);
+        this.auth = auth;
+      }
+      // the host is the remaining to the left of the first non-host char
+            hostEnd = -1;
+      for (i = 0; i < nonHostChars.length; i++) {
+        hec = rest.indexOf(nonHostChars[i]);
+        if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) {
+          hostEnd = hec;
+        }
+      }
+      // if we still have not hit it, then the entire thing is a host.
+            if (hostEnd === -1) {
+        hostEnd = rest.length;
+      }
+      if (rest[hostEnd - 1] === ":") {
+        hostEnd--;
+      }
+      var host = rest.slice(0, hostEnd);
+      rest = rest.slice(hostEnd);
+      // pull out port.
+            this.parseHost(host);
+      // we've indicated that there is a hostname,
+      // so even if it's empty, it has to be present.
+            this.hostname = this.hostname || "";
+      // if hostname begins with [ and ends with ]
+      // assume that it's an IPv6 address.
+            var ipv6Hostname = this.hostname[0] === "[" && this.hostname[this.hostname.length - 1] === "]";
+      // validate a little.
+            if (!ipv6Hostname) {
+        var hostparts = this.hostname.split(/\./);
+        for (i = 0, l = hostparts.length; i < l; i++) {
+          var part = hostparts[i];
+          if (!part) {
+            continue;
+          }
+          if (!part.match(hostnamePartPattern)) {
+            var newpart = "";
+            for (var j = 0, k = part.length; j < k; j++) {
+              if (part.charCodeAt(j) > 127) {
+                // we replace non-ASCII char with a temporary placeholder
+                // we need this to make sure size of hostname is not
+                // broken by replacing non-ASCII by nothing
+                newpart += "x";
+              } else {
+                newpart += part[j];
+              }
+            }
+            // we test again with ASCII char only
+                        if (!newpart.match(hostnamePartPattern)) {
+              var validParts = hostparts.slice(0, i);
+              var notHost = hostparts.slice(i + 1);
+              var bit = part.match(hostnamePartStart);
+              if (bit) {
+                validParts.push(bit[1]);
+                notHost.unshift(bit[2]);
+              }
+              if (notHost.length) {
+                rest = notHost.join(".") + rest;
+              }
+              this.hostname = validParts.join(".");
+              break;
+            }
+          }
+        }
+      }
+      if (this.hostname.length > hostnameMaxLen) {
+        this.hostname = "";
+      }
+      // strip [ and ] from the hostname
+      // the host field still retains them, though
+            if (ipv6Hostname) {
+        this.hostname = this.hostname.substr(1, this.hostname.length - 2);
+      }
+    }
+    // chop off from the tail first.
+        var hash = rest.indexOf("#");
+    if (hash !== -1) {
+      // got a fragment string.
+      this.hash = rest.substr(hash);
+      rest = rest.slice(0, hash);
+    }
+    var qm = rest.indexOf("?");
+    if (qm !== -1) {
+      this.search = rest.substr(qm);
+      rest = rest.slice(0, qm);
+    }
+    if (rest) {
+      this.pathname = rest;
+    }
+    if (slashedProtocol[lowerProto] && this.hostname && !this.pathname) {
+      this.pathname = "";
+    }
+    return this;
+  };
+  Url.prototype.parseHost = function(host) {
+    var port = portPattern.exec(host);
+    if (port) {
+      port = port[0];
+      if (port !== ":") {
+        this.port = port.substr(1);
+      }
+      host = host.substr(0, host.length - port.length);
+    }
+    if (host) {
+      this.hostname = host;
+    }
+  };
+  var parse$1 = urlParse;
+  var encode$1 = encode_1;
+  var decode$1 = decode_1;
+  var format = format$1;
+  var parse = parse$1;
+  var mdurl = {
+    encode: encode$1,
+    decode: decode$1,
+    format: format,
+    parse: parse
+  };
+  var regex$3 = /[\0-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/;
+  var regex$2 = /[\0-\x1F\x7F-\x9F]/;
+  var regex$1 = /[\xAD\u0600-\u0605\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F]/;
+  var regex = /[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/;
+  var Any = regex$3;
+  var Cc = regex$2;
+  var Cf = regex$1;
+  var P = regex$4;
+  var Z = regex;
+  var uc_micro = {
+    Any: Any,
+    Cc: Cc,
+    Cf: Cf,
+    P: P,
+    Z: Z
+  };
+  var utils = createCommonjsModule((function(module, exports) {
+    function _class(obj) {
+      return Object.prototype.toString.call(obj);
+    }
+    function isString(obj) {
+      return _class(obj) === "[object String]";
+    }
+    var _hasOwnProperty = Object.prototype.hasOwnProperty;
+    function has(object, key) {
+      return _hasOwnProperty.call(object, key);
+    }
+    // Merge objects
+    
+        function assign(obj /*from1, from2, from3, ...*/) {
+      var sources = Array.prototype.slice.call(arguments, 1);
+      sources.forEach((function(source) {
+        if (!source) {
+          return;
+        }
+        if (typeof source !== "object") {
+          throw new TypeError(source + "must be object");
+        }
+        Object.keys(source).forEach((function(key) {
+          obj[key] = source[key];
+        }));
+      }));
+      return obj;
+    }
+    // Remove element from array and put another array at those position.
+    // Useful for some operations with tokens
+        function arrayReplaceAt(src, pos, newElements) {
+      return [].concat(src.slice(0, pos), newElements, src.slice(pos + 1));
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        function isValidEntityCode(c) {
+      /*eslint no-bitwise:0*/
+      // broken sequence
+      if (c >= 55296 && c <= 57343) {
+        return false;
+      }
+      // never used
+            if (c >= 64976 && c <= 65007) {
+        return false;
+      }
+      if ((c & 65535) === 65535 || (c & 65535) === 65534) {
+        return false;
+      }
+      // control codes
+            if (c >= 0 && c <= 8) {
+        return false;
+      }
+      if (c === 11) {
+        return false;
+      }
+      if (c >= 14 && c <= 31) {
+        return false;
+      }
+      if (c >= 127 && c <= 159) {
+        return false;
+      }
+      // out of range
+            if (c > 1114111) {
+        return false;
+      }
+      return true;
+    }
+    function fromCodePoint(c) {
+      /*eslint no-bitwise:0*/
+      if (c > 65535) {
+        c -= 65536;
+        var surrogate1 = 55296 + (c >> 10), surrogate2 = 56320 + (c & 1023);
+        return String.fromCharCode(surrogate1, surrogate2);
+      }
+      return String.fromCharCode(c);
+    }
+    var UNESCAPE_MD_RE = /\\([!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])/g;
+    var ENTITY_RE = /&([a-z#][a-z0-9]{1,31});/gi;
+    var UNESCAPE_ALL_RE = new RegExp(UNESCAPE_MD_RE.source + "|" + ENTITY_RE.source, "gi");
+    var DIGITAL_ENTITY_TEST_RE = /^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i;
+    function replaceEntityPattern(match, name) {
+      var code;
+      if (has(entities, name)) {
+        return entities[name];
+      }
+      if (name.charCodeAt(0) === 35 /* # */ && DIGITAL_ENTITY_TEST_RE.test(name)) {
+        code = name[1].toLowerCase() === "x" ? parseInt(name.slice(2), 16) : parseInt(name.slice(1), 10);
+        if (isValidEntityCode(code)) {
+          return fromCodePoint(code);
+        }
+      }
+      return match;
+    }
+    /*function replaceEntities(str) {
+	  if (str.indexOf('&') < 0) { return str; }
+
+	  return str.replace(ENTITY_RE, replaceEntityPattern);
+	}*/    function unescapeMd(str) {
+      if (str.indexOf("\\") < 0) {
+        return str;
+      }
+      return str.replace(UNESCAPE_MD_RE, "$1");
+    }
+    function unescapeAll(str) {
+      if (str.indexOf("\\") < 0 && str.indexOf("&") < 0) {
+        return str;
+      }
+      return str.replace(UNESCAPE_ALL_RE, (function(match, escaped, entity) {
+        if (escaped) {
+          return escaped;
+        }
+        return replaceEntityPattern(match, entity);
+      }));
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        var HTML_ESCAPE_TEST_RE = /[&<>"]/;
+    var HTML_ESCAPE_REPLACE_RE = /[&<>"]/g;
+    var HTML_REPLACEMENTS = {
+      "&": "&",
+      "<": "<",
+      ">": ">",
+      '"': """
+    };
+    function replaceUnsafeChar(ch) {
+      return HTML_REPLACEMENTS[ch];
+    }
+    function escapeHtml(str) {
+      if (HTML_ESCAPE_TEST_RE.test(str)) {
+        return str.replace(HTML_ESCAPE_REPLACE_RE, replaceUnsafeChar);
+      }
+      return str;
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        var REGEXP_ESCAPE_RE = /[.?*+^$[\]\\(){}|-]/g;
+    function escapeRE(str) {
+      return str.replace(REGEXP_ESCAPE_RE, "\\$&");
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+        function isSpace(code) {
+      switch (code) {
+       case 9:
+       case 32:
+        return true;
+      }
+      return false;
+    }
+    // Zs (unicode class) || [\t\f\v\r\n]
+        function isWhiteSpace(code) {
+      if (code >= 8192 && code <= 8202) {
+        return true;
+      }
+      switch (code) {
+       case 9:
+ // \t
+               case 10:
+ // \n
+               case 11:
+ // \v
+               case 12:
+ // \f
+               case 13:
+ // \r
+               case 32:
+       case 160:
+       case 5760:
+       case 8239:
+       case 8287:
+       case 12288:
+        return true;
+      }
+      return false;
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+    /*eslint-disable max-len*/
+    // Currently without astral characters support.
+        function isPunctChar(ch) {
+      return regex$4.test(ch);
+    }
+    // Markdown ASCII punctuation characters.
+    
+    // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
+    // http://spec.commonmark.org/0.15/#ascii-punctuation-character
+    
+    // Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
+    
+        function isMdAsciiPunct(ch) {
+      switch (ch) {
+       case 33 /* ! */ :
+       case 34 /* " */ :
+       case 35 /* # */ :
+       case 36 /* $ */ :
+       case 37 /* % */ :
+       case 38 /* & */ :
+       case 39 /* ' */ :
+       case 40 /* ( */ :
+       case 41 /* ) */ :
+       case 42 /* * */ :
+       case 43 /* + */ :
+       case 44 /* , */ :
+       case 45 /* - */ :
+       case 46 /* . */ :
+       case 47 /* / */ :
+       case 58 /* : */ :
+       case 59 /* ; */ :
+       case 60 /* < */ :
+       case 61 /* = */ :
+       case 62 /* > */ :
+       case 63 /* ? */ :
+       case 64 /* @ */ :
+       case 91 /* [ */ :
+       case 92 /* \ */ :
+       case 93 /* ] */ :
+       case 94 /* ^ */ :
+       case 95 /* _ */ :
+       case 96 /* ` */ :
+       case 123 /* { */ :
+       case 124 /* | */ :
+       case 125 /* } */ :
+       case 126 /* ~ */ :
+        return true;
+
+       default:
+        return false;
+      }
+    }
+    // Hepler to unify [reference labels].
+    
+        function normalizeReference(str) {
+      // Trim and collapse whitespace
+      str = str.trim().replace(/\s+/g, " ");
+      // In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
+      // fixed in v12 (couldn't find any details).
+      
+      // So treat this one as a special case
+      // (remove this when node v10 is no longer supported).
+      
+            if ("\u1e9e".toLowerCase() === "\u1e7e") {
+        str = str.replace(/\u1e9e/g, "\xdf");
+      }
+      // .toLowerCase().toUpperCase() should get rid of all differences
+      // between letter variants.
+      
+      // Simple .toLowerCase() doesn't normalize 125 code points correctly,
+      // and .toUpperCase doesn't normalize 6 of them (list of exceptions:
+      // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
+      // uppercased versions).
+      
+      // Here's an example showing how it happens. Lets take greek letter omega:
+      // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
+      
+      // Unicode entries:
+      // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
+      // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
+      // 03D1;GREEK THETA SYMBOL;Ll;0;L; 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
+      // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L; 0398;;;;N;;;;03B8;
+      
+      // Case-insensitive comparison should treat all of them as equivalent.
+      
+      // But .toLowerCase() doesn't change ϑ (it's already lowercase),
+      // and .toUpperCase() doesn't change ϴ (already uppercase).
+      
+      // Applying first lower then upper case normalizes any character:
+      // '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
+      
+      // Note: this is equivalent to unicode case folding; unicode normalization
+      // is a different step that is not required here.
+      
+      // Final result should be uppercased, because it's later stored in an object
+      // (this avoid a conflict with Object.prototype members,
+      // most notably, `__proto__`)
+      
+            return str.toLowerCase().toUpperCase();
+    }
+    ////////////////////////////////////////////////////////////////////////////////
+    // Re-export libraries commonly used in both markdown-it and its plugins,
+    // so plugins won't have to depend on them explicitly, which reduces their
+    // bundled size (e.g. a browser build).
+    
+        exports.lib = {};
+    exports.lib.mdurl = mdurl;
+    exports.lib.ucmicro = uc_micro;
+    exports.assign = assign;
+    exports.isString = isString;
+    exports.has = has;
+    exports.unescapeMd = unescapeMd;
+    exports.unescapeAll = unescapeAll;
+    exports.isValidEntityCode = isValidEntityCode;
+    exports.fromCodePoint = fromCodePoint;
+    // exports.replaceEntities     = replaceEntities;
+        exports.escapeHtml = escapeHtml;
+    exports.arrayReplaceAt = arrayReplaceAt;
+    exports.isSpace = isSpace;
+    exports.isWhiteSpace = isWhiteSpace;
+    exports.isMdAsciiPunct = isMdAsciiPunct;
+    exports.isPunctChar = isPunctChar;
+    exports.escapeRE = escapeRE;
+    exports.normalizeReference = normalizeReference;
+  }));
+  // Parse link label
+    var parse_link_label = function parseLinkLabel(state, start, disableNested) {
+    var level, found, marker, prevPos, labelEnd = -1, max = state.posMax, oldPos = state.pos;
+    state.pos = start + 1;
+    level = 1;
+    while (state.pos < max) {
+      marker = state.src.charCodeAt(state.pos);
+      if (marker === 93 /* ] */) {
+        level--;
+        if (level === 0) {
+          found = true;
+          break;
+        }
+      }
+      prevPos = state.pos;
+      state.md.inline.skipToken(state);
+      if (marker === 91 /* [ */) {
+        if (prevPos === state.pos - 1) {
+          // increase level if we find text `[`, which is not a part of any token
+          level++;
+        } else if (disableNested) {
+          state.pos = oldPos;
+          return -1;
+        }
+      }
+    }
+    if (found) {
+      labelEnd = state.pos;
+    }
+    // restore old state
+        state.pos = oldPos;
+    return labelEnd;
+  };
+  var unescapeAll$2 = utils.unescapeAll;
+  var parse_link_destination = function parseLinkDestination(str, start, max) {
+    var code, level, pos = start, result = {
+      ok: false,
+      pos: 0,
+      lines: 0,
+      str: ""
+    };
+    if (str.charCodeAt(pos) === 60 /* < */) {
+      pos++;
+      while (pos < max) {
+        code = str.charCodeAt(pos);
+        if (code === 10 /* \n */) {
+          return result;
+        }
+        if (code === 60 /* < */) {
+          return result;
+        }
+        if (code === 62 /* > */) {
+          result.pos = pos + 1;
+          result.str = unescapeAll$2(str.slice(start + 1, pos));
+          result.ok = true;
+          return result;
+        }
+        if (code === 92 /* \ */ && pos + 1 < max) {
+          pos += 2;
+          continue;
+        }
+        pos++;
+      }
+      // no closing '>'
+            return result;
+    }
+    // this should be ... } else { ... branch
+        level = 0;
+    while (pos < max) {
+      code = str.charCodeAt(pos);
+      if (code === 32) {
+        break;
+      }
+      // ascii control characters
+            if (code < 32 || code === 127) {
+        break;
+      }
+      if (code === 92 /* \ */ && pos + 1 < max) {
+        if (str.charCodeAt(pos + 1) === 32) {
+          break;
+        }
+        pos += 2;
+        continue;
+      }
+      if (code === 40 /* ( */) {
+        level++;
+        if (level > 32) {
+          return result;
+        }
+      }
+      if (code === 41 /* ) */) {
+        if (level === 0) {
+          break;
+        }
+        level--;
+      }
+      pos++;
+    }
+    if (start === pos) {
+      return result;
+    }
+    if (level !== 0) {
+      return result;
+    }
+    result.str = unescapeAll$2(str.slice(start, pos));
+    result.pos = pos;
+    result.ok = true;
+    return result;
+  };
+  var unescapeAll$1 = utils.unescapeAll;
+  var parse_link_title = function parseLinkTitle(str, start, max) {
+    var code, marker, lines = 0, pos = start, result = {
+      ok: false,
+      pos: 0,
+      lines: 0,
+      str: ""
+    };
+    if (pos >= max) {
+      return result;
+    }
+    marker = str.charCodeAt(pos);
+    if (marker !== 34 /* " */ && marker !== 39 /* ' */ && marker !== 40 /* ( */) {
+      return result;
+    }
+    pos++;
+    // if opening marker is "(", switch it to closing marker ")"
+        if (marker === 40) {
+      marker = 41;
+    }
+    while (pos < max) {
+      code = str.charCodeAt(pos);
+      if (code === marker) {
+        result.pos = pos + 1;
+        result.lines = lines;
+        result.str = unescapeAll$1(str.slice(start + 1, pos));
+        result.ok = true;
+        return result;
+      } else if (code === 40 /* ( */ && marker === 41 /* ) */) {
+        return result;
+      } else if (code === 10) {
+        lines++;
+      } else if (code === 92 /* \ */ && pos + 1 < max) {
+        pos++;
+        if (str.charCodeAt(pos) === 10) {
+          lines++;
+        }
+      }
+      pos++;
+    }
+    return result;
+  };
+  var parseLinkLabel = parse_link_label;
+  var parseLinkDestination = parse_link_destination;
+  var parseLinkTitle = parse_link_title;
+  var helpers = {
+    parseLinkLabel: parseLinkLabel,
+    parseLinkDestination: parseLinkDestination,
+    parseLinkTitle: parseLinkTitle
+  };
+  var assign$1 = utils.assign;
+  var unescapeAll = utils.unescapeAll;
+  var escapeHtml = utils.escapeHtml;
+  ////////////////////////////////////////////////////////////////////////////////
+    var default_rules = {};
+  default_rules.code_inline = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx];
+    return "" + escapeHtml(token.content) + "";
+  };
+  default_rules.code_block = function(tokens, idx, options, env, slf) {
+    var token = tokens[idx];
+    return "" + escapeHtml(tokens[idx].content) + "
\n"; + }; + default_rules.fence = function(tokens, idx, options, env, slf) { + var token = tokens[idx], info = token.info ? unescapeAll(token.info).trim() : "", langName = "", langAttrs = "", highlighted, i, arr, tmpAttrs, tmpToken; + if (info) { + arr = info.split(/(\s+)/g); + langName = arr[0]; + langAttrs = arr.slice(2).join(""); + } + if (options.highlight) { + highlighted = options.highlight(token.content, langName, langAttrs) || escapeHtml(token.content); + } else { + highlighted = escapeHtml(token.content); + } + if (highlighted.indexOf("" + highlighted + "\n"; + } + return "
" + highlighted + "
\n"; + }; + default_rules.image = function(tokens, idx, options, env, slf) { + var token = tokens[idx]; + // "alt" attr MUST be set, even if empty. Because it's mandatory and + // should be placed on proper position for tests. + + // Replace content with actual value + token.attrs[token.attrIndex("alt")][1] = slf.renderInlineAsText(token.children, options, env); + return slf.renderToken(tokens, idx, options); + }; + default_rules.hardbreak = function(tokens, idx, options /*, env */) { + return options.xhtmlOut ? "
\n" : "
\n"; + }; + default_rules.softbreak = function(tokens, idx, options /*, env */) { + return options.breaks ? options.xhtmlOut ? "
\n" : "
\n" : "\n"; + }; + default_rules.text = function(tokens, idx /*, options, env */) { + return escapeHtml(tokens[idx].content); + }; + default_rules.html_block = function(tokens, idx /*, options, env */) { + return tokens[idx].content; + }; + default_rules.html_inline = function(tokens, idx /*, options, env */) { + return tokens[idx].content; + }; + /** + * new Renderer() + * + * Creates new [[Renderer]] instance and fill [[Renderer#rules]] with defaults. + **/ function Renderer() { + /** + * Renderer#rules -> Object + * + * Contains render rules for tokens. Can be updated and extended. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')(); + * + * md.renderer.rules.strong_open = function () { return ''; }; + * md.renderer.rules.strong_close = function () { return ''; }; + * + * var result = md.renderInline(...); + * ``` + * + * Each rule is called as independent static function with fixed signature: + * + * ```javascript + * function my_token_render(tokens, idx, options, env, renderer) { + * // ... + * return renderedHTML; + * } + * ``` + * + * See [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js) + * for more details and examples. + **/ + this.rules = assign$1({}, default_rules); + } + /** + * Renderer.renderAttrs(token) -> String + * + * Render token attributes to string. + **/ Renderer.prototype.renderAttrs = function renderAttrs(token) { + var i, l, result; + if (!token.attrs) { + return ""; + } + result = ""; + for (i = 0, l = token.attrs.length; i < l; i++) { + result += " " + escapeHtml(token.attrs[i][0]) + '="' + escapeHtml(token.attrs[i][1]) + '"'; + } + return result; + }; + /** + * Renderer.renderToken(tokens, idx, options) -> String + * - tokens (Array): list of tokens + * - idx (Numbed): token index to render + * - options (Object): params of parser instance + * + * Default token renderer. Can be overriden by custom function + * in [[Renderer#rules]]. + **/ Renderer.prototype.renderToken = function renderToken(tokens, idx, options) { + var nextToken, result = "", needLf = false, token = tokens[idx]; + // Tight list paragraphs + if (token.hidden) { + return ""; + } + // Insert a newline between hidden paragraph and subsequent opening + // block-level tag. + + // For example, here we should insert a newline before blockquote: + // - a + // > + + if (token.block && token.nesting !== -1 && idx && tokens[idx - 1].hidden) { + result += "\n"; + } + // Add token name, e.g. ``. + needLf = false; + } + } + } + } + result += needLf ? ">\n" : ">"; + return result; + }; + /** + * Renderer.renderInline(tokens, options, env) -> String + * - tokens (Array): list on block tokens to render + * - options (Object): params of parser instance + * - env (Object): additional data from parsed input (references, for example) + * + * The same as [[Renderer.render]], but for single token of `inline` type. + **/ Renderer.prototype.renderInline = function(tokens, options, env) { + var type, result = "", rules = this.rules; + for (var i = 0, len = tokens.length; i < len; i++) { + type = tokens[i].type; + if (typeof rules[type] !== "undefined") { + result += rules[type](tokens, i, options, env, this); + } else { + result += this.renderToken(tokens, i, options); + } + } + return result; + }; + /** internal + * Renderer.renderInlineAsText(tokens, options, env) -> String + * - tokens (Array): list on block tokens to render + * - options (Object): params of parser instance + * - env (Object): additional data from parsed input (references, for example) + * + * Special kludge for image `alt` attributes to conform CommonMark spec. + * Don't try to use it! Spec requires to show `alt` content with stripped markup, + * instead of simple escaping. + **/ Renderer.prototype.renderInlineAsText = function(tokens, options, env) { + var result = ""; + for (var i = 0, len = tokens.length; i < len; i++) { + if (tokens[i].type === "text") { + result += tokens[i].content; + } else if (tokens[i].type === "image") { + result += this.renderInlineAsText(tokens[i].children, options, env); + } else if (tokens[i].type === "softbreak") { + result += "\n"; + } + } + return result; + }; + /** + * Renderer.render(tokens, options, env) -> String + * - tokens (Array): list on block tokens to render + * - options (Object): params of parser instance + * - env (Object): additional data from parsed input (references, for example) + * + * Takes token stream and generates HTML. Probably, you will never need to call + * this method directly. + **/ Renderer.prototype.render = function(tokens, options, env) { + var i, len, type, result = "", rules = this.rules; + for (i = 0, len = tokens.length; i < len; i++) { + type = tokens[i].type; + if (type === "inline") { + result += this.renderInline(tokens[i].children, options, env); + } else if (typeof rules[type] !== "undefined") { + result += rules[type](tokens, i, options, env, this); + } else { + result += this.renderToken(tokens, i, options, env); + } + } + return result; + }; + var renderer = Renderer; + /** + * class Ruler + * + * Helper class, used by [[MarkdownIt#core]], [[MarkdownIt#block]] and + * [[MarkdownIt#inline]] to manage sequences of functions (rules): + * + * - keep rules in defined order + * - assign the name to each rule + * - enable/disable rules + * - add/replace rules + * - allow assign rules to additional named chains (in the same) + * - cacheing lists of active rules + * + * You will not need use this class directly until write plugins. For simple + * rules control use [[MarkdownIt.disable]], [[MarkdownIt.enable]] and + * [[MarkdownIt.use]]. + **/ + /** + * new Ruler() + **/ function Ruler() { + // List of added rules. Each element is: + // { + // name: XXX, + // enabled: Boolean, + // fn: Function(), + // alt: [ name2, name3 ] + // } + this.__rules__ = []; + // Cached rule chains. + + // First level - chain name, '' for default. + // Second level - diginal anchor for fast filtering by charcodes. + + this.__cache__ = null; + } + //////////////////////////////////////////////////////////////////////////////// + // Helper methods, should not be used directly + // Find rule index by name + + Ruler.prototype.__find__ = function(name) { + for (var i = 0; i < this.__rules__.length; i++) { + if (this.__rules__[i].name === name) { + return i; + } + } + return -1; + }; + // Build rules lookup cache + + Ruler.prototype.__compile__ = function() { + var self = this; + var chains = [ "" ]; + // collect unique names + self.__rules__.forEach((function(rule) { + if (!rule.enabled) { + return; + } + rule.alt.forEach((function(altName) { + if (chains.indexOf(altName) < 0) { + chains.push(altName); + } + })); + })); + self.__cache__ = {}; + chains.forEach((function(chain) { + self.__cache__[chain] = []; + self.__rules__.forEach((function(rule) { + if (!rule.enabled) { + return; + } + if (chain && rule.alt.indexOf(chain) < 0) { + return; + } + self.__cache__[chain].push(rule.fn); + })); + })); + }; + /** + * Ruler.at(name, fn [, options]) + * - name (String): rule name to replace. + * - fn (Function): new rule function. + * - options (Object): new rule options (not mandatory). + * + * Replace rule by name with new function & options. Throws error if name not + * found. + * + * ##### Options: + * + * - __alt__ - array with names of "alternate" chains. + * + * ##### Example + * + * Replace existing typographer replacement rule with new one: + * + * ```javascript + * var md = require('markdown-it')(); + * + * md.core.ruler.at('replacements', function replace(state) { + * //... + * }); + * ``` + **/ Ruler.prototype.at = function(name, fn, options) { + var index = this.__find__(name); + var opt = options || {}; + if (index === -1) { + throw new Error("Parser rule not found: " + name); + } + this.__rules__[index].fn = fn; + this.__rules__[index].alt = opt.alt || []; + this.__cache__ = null; + }; + /** + * Ruler.before(beforeName, ruleName, fn [, options]) + * - beforeName (String): new rule will be added before this one. + * - ruleName (String): name of added rule. + * - fn (Function): rule function. + * - options (Object): rule options (not mandatory). + * + * Add new rule to chain before one with given name. See also + * [[Ruler.after]], [[Ruler.push]]. + * + * ##### Options: + * + * - __alt__ - array with names of "alternate" chains. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')(); + * + * md.block.ruler.before('paragraph', 'my_rule', function replace(state) { + * //... + * }); + * ``` + **/ Ruler.prototype.before = function(beforeName, ruleName, fn, options) { + var index = this.__find__(beforeName); + var opt = options || {}; + if (index === -1) { + throw new Error("Parser rule not found: " + beforeName); + } + this.__rules__.splice(index, 0, { + name: ruleName, + enabled: true, + fn: fn, + alt: opt.alt || [] + }); + this.__cache__ = null; + }; + /** + * Ruler.after(afterName, ruleName, fn [, options]) + * - afterName (String): new rule will be added after this one. + * - ruleName (String): name of added rule. + * - fn (Function): rule function. + * - options (Object): rule options (not mandatory). + * + * Add new rule to chain after one with given name. See also + * [[Ruler.before]], [[Ruler.push]]. + * + * ##### Options: + * + * - __alt__ - array with names of "alternate" chains. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')(); + * + * md.inline.ruler.after('text', 'my_rule', function replace(state) { + * //... + * }); + * ``` + **/ Ruler.prototype.after = function(afterName, ruleName, fn, options) { + var index = this.__find__(afterName); + var opt = options || {}; + if (index === -1) { + throw new Error("Parser rule not found: " + afterName); + } + this.__rules__.splice(index + 1, 0, { + name: ruleName, + enabled: true, + fn: fn, + alt: opt.alt || [] + }); + this.__cache__ = null; + }; + /** + * Ruler.push(ruleName, fn [, options]) + * - ruleName (String): name of added rule. + * - fn (Function): rule function. + * - options (Object): rule options (not mandatory). + * + * Push new rule to the end of chain. See also + * [[Ruler.before]], [[Ruler.after]]. + * + * ##### Options: + * + * - __alt__ - array with names of "alternate" chains. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')(); + * + * md.core.ruler.push('my_rule', function replace(state) { + * //... + * }); + * ``` + **/ Ruler.prototype.push = function(ruleName, fn, options) { + var opt = options || {}; + this.__rules__.push({ + name: ruleName, + enabled: true, + fn: fn, + alt: opt.alt || [] + }); + this.__cache__ = null; + }; + /** + * Ruler.enable(list [, ignoreInvalid]) -> Array + * - list (String|Array): list of rule names to enable. + * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found. + * + * Enable rules with given names. If any rule name not found - throw Error. + * Errors can be disabled by second param. + * + * Returns list of found rule names (if no exception happened). + * + * See also [[Ruler.disable]], [[Ruler.enableOnly]]. + **/ Ruler.prototype.enable = function(list, ignoreInvalid) { + if (!Array.isArray(list)) { + list = [ list ]; + } + var result = []; + // Search by name and enable + list.forEach((function(name) { + var idx = this.__find__(name); + if (idx < 0) { + if (ignoreInvalid) { + return; + } + throw new Error("Rules manager: invalid rule name " + name); + } + this.__rules__[idx].enabled = true; + result.push(name); + }), this); + this.__cache__ = null; + return result; + }; + /** + * Ruler.enableOnly(list [, ignoreInvalid]) + * - list (String|Array): list of rule names to enable (whitelist). + * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found. + * + * Enable rules with given names, and disable everything else. If any rule name + * not found - throw Error. Errors can be disabled by second param. + * + * See also [[Ruler.disable]], [[Ruler.enable]]. + **/ Ruler.prototype.enableOnly = function(list, ignoreInvalid) { + if (!Array.isArray(list)) { + list = [ list ]; + } + this.__rules__.forEach((function(rule) { + rule.enabled = false; + })); + this.enable(list, ignoreInvalid); + }; + /** + * Ruler.disable(list [, ignoreInvalid]) -> Array + * - list (String|Array): list of rule names to disable. + * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found. + * + * Disable rules with given names. If any rule name not found - throw Error. + * Errors can be disabled by second param. + * + * Returns list of found rule names (if no exception happened). + * + * See also [[Ruler.enable]], [[Ruler.enableOnly]]. + **/ Ruler.prototype.disable = function(list, ignoreInvalid) { + if (!Array.isArray(list)) { + list = [ list ]; + } + var result = []; + // Search by name and disable + list.forEach((function(name) { + var idx = this.__find__(name); + if (idx < 0) { + if (ignoreInvalid) { + return; + } + throw new Error("Rules manager: invalid rule name " + name); + } + this.__rules__[idx].enabled = false; + result.push(name); + }), this); + this.__cache__ = null; + return result; + }; + /** + * Ruler.getRules(chainName) -> Array + * + * Return array of active functions (rules) for given chain name. It analyzes + * rules configuration, compiles caches if not exists and returns result. + * + * Default chain name is `''` (empty string). It can't be skipped. That's + * done intentionally, to keep signature monomorphic for high speed. + **/ Ruler.prototype.getRules = function(chainName) { + if (this.__cache__ === null) { + this.__compile__(); + } + // Chain can be empty, if rules disabled. But we still have to return Array. + return this.__cache__[chainName] || []; + }; + var ruler = Ruler; + // Normalize input string + // https://spec.commonmark.org/0.29/#line-ending + var NEWLINES_RE = /\r\n?|\n/g; + var NULL_RE = /\0/g; + var normalize = function normalize(state) { + var str; + // Normalize newlines + str = state.src.replace(NEWLINES_RE, "\n"); + // Replace NULL characters + str = str.replace(NULL_RE, "\ufffd"); + state.src = str; + }; + var block = function block(state) { + var token; + if (state.inlineMode) { + token = new state.Token("inline", "", 0); + token.content = state.src; + token.map = [ 0, 1 ]; + token.children = []; + state.tokens.push(token); + } else { + state.md.block.parse(state.src, state.md, state.env, state.tokens); + } + }; + var inline = function inline(state) { + var tokens = state.tokens, tok, i, l; + // Parse inlines + for (i = 0, l = tokens.length; i < l; i++) { + tok = tokens[i]; + if (tok.type === "inline") { + state.md.inline.parse(tok.content, state.md, state.env, tok.children); + } + } + }; + var arrayReplaceAt = utils.arrayReplaceAt; + function isLinkOpen$1(str) { + return /^\s]/i.test(str); + } + function isLinkClose$1(str) { + return /^<\/a\s*>/i.test(str); + } + var linkify$1 = function linkify(state) { + var i, j, l, tokens, token, currentToken, nodes, ln, text, pos, lastPos, level, htmlLinkLevel, url, fullUrl, urlText, blockTokens = state.tokens, links; + if (!state.md.options.linkify) { + return; + } + for (j = 0, l = blockTokens.length; j < l; j++) { + if (blockTokens[j].type !== "inline" || !state.md.linkify.pretest(blockTokens[j].content)) { + continue; + } + tokens = blockTokens[j].children; + htmlLinkLevel = 0; + // We scan from the end, to keep position when new tags added. + // Use reversed logic in links start/end match + for (i = tokens.length - 1; i >= 0; i--) { + currentToken = tokens[i]; + // Skip content of markdown links + if (currentToken.type === "link_close") { + i--; + while (tokens[i].level !== currentToken.level && tokens[i].type !== "link_open") { + i--; + } + continue; + } + // Skip content of html tag links + if (currentToken.type === "html_inline") { + if (isLinkOpen$1(currentToken.content) && htmlLinkLevel > 0) { + htmlLinkLevel--; + } + if (isLinkClose$1(currentToken.content)) { + htmlLinkLevel++; + } + } + if (htmlLinkLevel > 0) { + continue; + } + if (currentToken.type === "text" && state.md.linkify.test(currentToken.content)) { + text = currentToken.content; + links = state.md.linkify.match(text); + // Now split string to nodes + nodes = []; + level = currentToken.level; + lastPos = 0; + // forbid escape sequence at the start of the string, + // this avoids http\://example.com/ from being linkified as + // http://example.com/ + if (links.length > 0 && links[0].index === 0 && i > 0 && tokens[i - 1].type === "text_special") { + links = links.slice(1); + } + for (ln = 0; ln < links.length; ln++) { + url = links[ln].url; + fullUrl = state.md.normalizeLink(url); + if (!state.md.validateLink(fullUrl)) { + continue; + } + urlText = links[ln].text; + // Linkifier might send raw hostnames like "example.com", where url + // starts with domain name. So we prepend http:// in those cases, + // and remove it afterwards. + + if (!links[ln].schema) { + urlText = state.md.normalizeLinkText("http://" + urlText).replace(/^http:\/\//, ""); + } else if (links[ln].schema === "mailto:" && !/^mailto:/i.test(urlText)) { + urlText = state.md.normalizeLinkText("mailto:" + urlText).replace(/^mailto:/, ""); + } else { + urlText = state.md.normalizeLinkText(urlText); + } + pos = links[ln].index; + if (pos > lastPos) { + token = new state.Token("text", "", 0); + token.content = text.slice(lastPos, pos); + token.level = level; + nodes.push(token); + } + token = new state.Token("link_open", "a", 1); + token.attrs = [ [ "href", fullUrl ] ]; + token.level = level++; + token.markup = "linkify"; + token.info = "auto"; + nodes.push(token); + token = new state.Token("text", "", 0); + token.content = urlText; + token.level = level; + nodes.push(token); + token = new state.Token("link_close", "a", -1); + token.level = --level; + token.markup = "linkify"; + token.info = "auto"; + nodes.push(token); + lastPos = links[ln].lastIndex; + } + if (lastPos < text.length) { + token = new state.Token("text", "", 0); + token.content = text.slice(lastPos); + token.level = level; + nodes.push(token); + } + // replace current node + blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes); + } + } + } + }; + // Simple typographic replacements + // TODO: + // - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾ + // - multiplications 2 x 4 -> 2 × 4 + var RARE_RE = /\+-|\.\.|\?\?\?\?|!!!!|,,|--/; + // Workaround for phantomjs - need regex without /g flag, + // or root check will fail every second time + var SCOPED_ABBR_TEST_RE = /\((c|tm|r)\)/i; + var SCOPED_ABBR_RE = /\((c|tm|r)\)/gi; + var SCOPED_ABBR = { + c: "\xa9", + r: "\xae", + tm: "\u2122" + }; + function replaceFn(match, name) { + return SCOPED_ABBR[name.toLowerCase()]; + } + function replace_scoped(inlineTokens) { + var i, token, inside_autolink = 0; + for (i = inlineTokens.length - 1; i >= 0; i--) { + token = inlineTokens[i]; + if (token.type === "text" && !inside_autolink) { + token.content = token.content.replace(SCOPED_ABBR_RE, replaceFn); + } + if (token.type === "link_open" && token.info === "auto") { + inside_autolink--; + } + if (token.type === "link_close" && token.info === "auto") { + inside_autolink++; + } + } + } + function replace_rare(inlineTokens) { + var i, token, inside_autolink = 0; + for (i = inlineTokens.length - 1; i >= 0; i--) { + token = inlineTokens[i]; + if (token.type === "text" && !inside_autolink) { + if (RARE_RE.test(token.content)) { + token.content = token.content.replace(/\+-/g, "\xb1").replace(/\.{2,}/g, "\u2026").replace(/([?!])\u2026/g, "$1..").replace(/([?!]){4,}/g, "$1$1$1").replace(/,{2,}/g, ",").replace(/(^|[^-])---(?=[^-]|$)/gm, "$1\u2014").replace(/(^|\s)--(?=\s|$)/gm, "$1\u2013").replace(/(^|[^-\s])--(?=[^-\s]|$)/gm, "$1\u2013"); + } + } + if (token.type === "link_open" && token.info === "auto") { + inside_autolink--; + } + if (token.type === "link_close" && token.info === "auto") { + inside_autolink++; + } + } + } + var replacements = function replace(state) { + var blkIdx; + if (!state.md.options.typographer) { + return; + } + for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) { + if (state.tokens[blkIdx].type !== "inline") { + continue; + } + if (SCOPED_ABBR_TEST_RE.test(state.tokens[blkIdx].content)) { + replace_scoped(state.tokens[blkIdx].children); + } + if (RARE_RE.test(state.tokens[blkIdx].content)) { + replace_rare(state.tokens[blkIdx].children); + } + } + }; + var isWhiteSpace$1 = utils.isWhiteSpace; + var isPunctChar$1 = utils.isPunctChar; + var isMdAsciiPunct$1 = utils.isMdAsciiPunct; + var QUOTE_TEST_RE = /['"]/; + var QUOTE_RE = /['"]/g; + var APOSTROPHE = "\u2019"; + /* ’ */ function replaceAt(str, index, ch) { + return str.slice(0, index) + ch + str.slice(index + 1); + } + function process_inlines(tokens, state) { + var i, token, text, t, pos, max, thisLevel, item, lastChar, nextChar, isLastPunctChar, isNextPunctChar, isLastWhiteSpace, isNextWhiteSpace, canOpen, canClose, j, isSingle, stack, openQuote, closeQuote; + stack = []; + for (i = 0; i < tokens.length; i++) { + token = tokens[i]; + thisLevel = tokens[i].level; + for (j = stack.length - 1; j >= 0; j--) { + if (stack[j].level <= thisLevel) { + break; + } + } + stack.length = j + 1; + if (token.type !== "text") { + continue; + } + text = token.content; + pos = 0; + max = text.length; + /*eslint no-labels:0,block-scoped-var:0*/ OUTER: while (pos < max) { + QUOTE_RE.lastIndex = pos; + t = QUOTE_RE.exec(text); + if (!t) { + break; + } + canOpen = canClose = true; + pos = t.index + 1; + isSingle = t[0] === "'"; + // Find previous character, + // default to space if it's the beginning of the line + + lastChar = 32; + if (t.index - 1 >= 0) { + lastChar = text.charCodeAt(t.index - 1); + } else { + for (j = i - 1; j >= 0; j--) { + if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break; + // lastChar defaults to 0x20 + if (!tokens[j].content) continue; + // should skip all tokens except 'text', 'html_inline' or 'code_inline' + lastChar = tokens[j].content.charCodeAt(tokens[j].content.length - 1); + break; + } + } + // Find next character, + // default to space if it's the end of the line + + nextChar = 32; + if (pos < max) { + nextChar = text.charCodeAt(pos); + } else { + for (j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === "softbreak" || tokens[j].type === "hardbreak") break; + // nextChar defaults to 0x20 + if (!tokens[j].content) continue; + // should skip all tokens except 'text', 'html_inline' or 'code_inline' + nextChar = tokens[j].content.charCodeAt(0); + break; + } + } + isLastPunctChar = isMdAsciiPunct$1(lastChar) || isPunctChar$1(String.fromCharCode(lastChar)); + isNextPunctChar = isMdAsciiPunct$1(nextChar) || isPunctChar$1(String.fromCharCode(nextChar)); + isLastWhiteSpace = isWhiteSpace$1(lastChar); + isNextWhiteSpace = isWhiteSpace$1(nextChar); + if (isNextWhiteSpace) { + canOpen = false; + } else if (isNextPunctChar) { + if (!(isLastWhiteSpace || isLastPunctChar)) { + canOpen = false; + } + } + if (isLastWhiteSpace) { + canClose = false; + } else if (isLastPunctChar) { + if (!(isNextWhiteSpace || isNextPunctChar)) { + canClose = false; + } + } + if (nextChar === 34 /* " */ && t[0] === '"') { + if (lastChar >= 48 /* 0 */ && lastChar <= 57 /* 9 */) { + // special case: 1"" - count first quote as an inch + canClose = canOpen = false; + } + } + if (canOpen && canClose) { + // Replace quotes in the middle of punctuation sequence, but not + // in the middle of the words, i.e.: + // 1. foo " bar " baz - not replaced + // 2. foo-"-bar-"-baz - replaced + // 3. foo"bar"baz - not replaced + canOpen = isLastPunctChar; + canClose = isNextPunctChar; + } + if (!canOpen && !canClose) { + // middle of word + if (isSingle) { + token.content = replaceAt(token.content, t.index, APOSTROPHE); + } + continue; + } + if (canClose) { + // this could be a closing quote, rewind the stack to get a match + for (j = stack.length - 1; j >= 0; j--) { + item = stack[j]; + if (stack[j].level < thisLevel) { + break; + } + if (item.single === isSingle && stack[j].level === thisLevel) { + item = stack[j]; + if (isSingle) { + openQuote = state.md.options.quotes[2]; + closeQuote = state.md.options.quotes[3]; + } else { + openQuote = state.md.options.quotes[0]; + closeQuote = state.md.options.quotes[1]; + } + // replace token.content *before* tokens[item.token].content, + // because, if they are pointing at the same token, replaceAt + // could mess up indices when quote length != 1 + token.content = replaceAt(token.content, t.index, closeQuote); + tokens[item.token].content = replaceAt(tokens[item.token].content, item.pos, openQuote); + pos += closeQuote.length - 1; + if (item.token === i) { + pos += openQuote.length - 1; + } + text = token.content; + max = text.length; + stack.length = j; + continue OUTER; + } + } + } + if (canOpen) { + stack.push({ + token: i, + pos: t.index, + single: isSingle, + level: thisLevel + }); + } else if (canClose && isSingle) { + token.content = replaceAt(token.content, t.index, APOSTROPHE); + } + } + } + } + var smartquotes = function smartquotes(state) { + /*eslint max-depth:0*/ + var blkIdx; + if (!state.md.options.typographer) { + return; + } + for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) { + if (state.tokens[blkIdx].type !== "inline" || !QUOTE_TEST_RE.test(state.tokens[blkIdx].content)) { + continue; + } + process_inlines(state.tokens[blkIdx].children, state); + } + }; + // Join raw text tokens with the rest of the text + var text_join = function text_join(state) { + var j, l, tokens, curr, max, last, blockTokens = state.tokens; + for (j = 0, l = blockTokens.length; j < l; j++) { + if (blockTokens[j].type !== "inline") continue; + tokens = blockTokens[j].children; + max = tokens.length; + for (curr = 0; curr < max; curr++) { + if (tokens[curr].type === "text_special") { + tokens[curr].type = "text"; + } + } + for (curr = last = 0; curr < max; curr++) { + if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") { + // collapse two adjacent text nodes + tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content; + } else { + if (curr !== last) { + tokens[last] = tokens[curr]; + } + last++; + } + } + if (curr !== last) { + tokens.length = last; + } + } + }; + // Token class + /** + * class Token + **/ + /** + * new Token(type, tag, nesting) + * + * Create new token and fill passed properties. + **/ function Token(type, tag, nesting) { + /** + * Token#type -> String + * + * Type of the token (string, e.g. "paragraph_open") + **/ + this.type = type; + /** + * Token#tag -> String + * + * html tag name, e.g. "p" + **/ this.tag = tag; + /** + * Token#attrs -> Array + * + * Html attributes. Format: `[ [ name1, value1 ], [ name2, value2 ] ]` + **/ this.attrs = null; + /** + * Token#map -> Array + * + * Source map info. Format: `[ line_begin, line_end ]` + **/ this.map = null; + /** + * Token#nesting -> Number + * + * Level change (number in {-1, 0, 1} set), where: + * + * - `1` means the tag is opening + * - `0` means the tag is self-closing + * - `-1` means the tag is closing + **/ this.nesting = nesting; + /** + * Token#level -> Number + * + * nesting level, the same as `state.level` + **/ this.level = 0; + /** + * Token#children -> Array + * + * An array of child nodes (inline and img tokens) + **/ this.children = null; + /** + * Token#content -> String + * + * In a case of self-closing tag (code, html, fence, etc.), + * it has contents of this tag. + **/ this.content = ""; + /** + * Token#markup -> String + * + * '*' or '_' for emphasis, fence string for fence, etc. + **/ this.markup = ""; + /** + * Token#info -> String + * + * Additional information: + * + * - Info string for "fence" tokens + * - The value "auto" for autolink "link_open" and "link_close" tokens + * - The string value of the item marker for ordered-list "list_item_open" tokens + **/ this.info = ""; + /** + * Token#meta -> Object + * + * A place for plugins to store an arbitrary data + **/ this.meta = null; + /** + * Token#block -> Boolean + * + * True for block-level tokens, false for inline tokens. + * Used in renderer to calculate line breaks + **/ this.block = false; + /** + * Token#hidden -> Boolean + * + * If it's true, ignore this element when rendering. Used for tight lists + * to hide paragraphs. + **/ this.hidden = false; + } + /** + * Token.attrIndex(name) -> Number + * + * Search attribute index by name. + **/ Token.prototype.attrIndex = function attrIndex(name) { + var attrs, i, len; + if (!this.attrs) { + return -1; + } + attrs = this.attrs; + for (i = 0, len = attrs.length; i < len; i++) { + if (attrs[i][0] === name) { + return i; + } + } + return -1; + }; + /** + * Token.attrPush(attrData) + * + * Add `[ name, value ]` attribute to list. Init attrs if necessary + **/ Token.prototype.attrPush = function attrPush(attrData) { + if (this.attrs) { + this.attrs.push(attrData); + } else { + this.attrs = [ attrData ]; + } + }; + /** + * Token.attrSet(name, value) + * + * Set `name` attribute to `value`. Override old value if exists. + **/ Token.prototype.attrSet = function attrSet(name, value) { + var idx = this.attrIndex(name), attrData = [ name, value ]; + if (idx < 0) { + this.attrPush(attrData); + } else { + this.attrs[idx] = attrData; + } + }; + /** + * Token.attrGet(name) + * + * Get the value of attribute `name`, or null if it does not exist. + **/ Token.prototype.attrGet = function attrGet(name) { + var idx = this.attrIndex(name), value = null; + if (idx >= 0) { + value = this.attrs[idx][1]; + } + return value; + }; + /** + * Token.attrJoin(name, value) + * + * Join value to existing attribute via space. Or create new attribute if not + * exists. Useful to operate with token classes. + **/ Token.prototype.attrJoin = function attrJoin(name, value) { + var idx = this.attrIndex(name); + if (idx < 0) { + this.attrPush([ name, value ]); + } else { + this.attrs[idx][1] = this.attrs[idx][1] + " " + value; + } + }; + var token = Token; + function StateCore(src, md, env) { + this.src = src; + this.env = env; + this.tokens = []; + this.inlineMode = false; + this.md = md; + // link to parser instance + } + // re-export Token class to use in core rules + StateCore.prototype.Token = token; + var state_core = StateCore; + var _rules$2 = [ [ "normalize", normalize ], [ "block", block ], [ "inline", inline ], [ "linkify", linkify$1 ], [ "replacements", replacements ], [ "smartquotes", smartquotes ], + // `text_join` finds `text_special` tokens (for escape sequences) + // and joins them with the rest of the text + [ "text_join", text_join ] ]; + /** + * new Core() + **/ function Core() { + /** + * Core#ruler -> Ruler + * + * [[Ruler]] instance. Keep configuration of core rules. + **/ + this.ruler = new ruler; + for (var i = 0; i < _rules$2.length; i++) { + this.ruler.push(_rules$2[i][0], _rules$2[i][1]); + } + } + /** + * Core.process(state) + * + * Executes core chain rules. + **/ Core.prototype.process = function(state) { + var i, l, rules; + rules = this.ruler.getRules(""); + for (i = 0, l = rules.length; i < l; i++) { + rules[i](state); + } + }; + Core.prototype.State = state_core; + var parser_core = Core; + var isSpace$a = utils.isSpace; + function getLine(state, line) { + var pos = state.bMarks[line] + state.tShift[line], max = state.eMarks[line]; + return state.src.slice(pos, max); + } + function escapedSplit(str) { + var result = [], pos = 0, max = str.length, ch, isEscaped = false, lastPos = 0, current = ""; + ch = str.charCodeAt(pos); + while (pos < max) { + if (ch === 124 /* | */) { + if (!isEscaped) { + // pipe separating cells, '|' + result.push(current + str.substring(lastPos, pos)); + current = ""; + lastPos = pos + 1; + } else { + // escaped pipe, '\|' + current += str.substring(lastPos, pos - 1); + lastPos = pos; + } + } + isEscaped = ch === 92 /* \ */; + pos++; + ch = str.charCodeAt(pos); + } + result.push(current + str.substring(lastPos)); + return result; + } + var table = function table(state, startLine, endLine, silent) { + var ch, lineText, pos, i, l, nextLine, columns, columnCount, token, aligns, t, tableLines, tbodyLines, oldParentType, terminate, terminatorRules, firstCh, secondCh; + // should have at least two lines + if (startLine + 2 > endLine) { + return false; + } + nextLine = startLine + 1; + if (state.sCount[nextLine] < state.blkIndent) { + return false; + } + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[nextLine] - state.blkIndent >= 4) { + return false; + } + // first character of the second line should be '|', '-', ':', + // and no other characters are allowed but spaces; + // basically, this is the equivalent of /^[-:|][-:|\s]*$/ regexp + pos = state.bMarks[nextLine] + state.tShift[nextLine]; + if (pos >= state.eMarks[nextLine]) { + return false; + } + firstCh = state.src.charCodeAt(pos++); + if (firstCh !== 124 /* | */ && firstCh !== 45 /* - */ && firstCh !== 58 /* : */) { + return false; + } + if (pos >= state.eMarks[nextLine]) { + return false; + } + secondCh = state.src.charCodeAt(pos++); + if (secondCh !== 124 /* | */ && secondCh !== 45 /* - */ && secondCh !== 58 /* : */ && !isSpace$a(secondCh)) { + return false; + } + // if first character is '-', then second character must not be a space + // (due to parsing ambiguity with list) + if (firstCh === 45 /* - */ && isSpace$a(secondCh)) { + return false; + } + while (pos < state.eMarks[nextLine]) { + ch = state.src.charCodeAt(pos); + if (ch !== 124 /* | */ && ch !== 45 /* - */ && ch !== 58 /* : */ && !isSpace$a(ch)) { + return false; + } + pos++; + } + lineText = getLine(state, startLine + 1); + columns = lineText.split("|"); + aligns = []; + for (i = 0; i < columns.length; i++) { + t = columns[i].trim(); + if (!t) { + // allow empty columns before and after table, but not in between columns; + // e.g. allow ` |---| `, disallow ` ---||--- ` + if (i === 0 || i === columns.length - 1) { + continue; + } else { + return false; + } + } + if (!/^:?-+:?$/.test(t)) { + return false; + } + if (t.charCodeAt(t.length - 1) === 58 /* : */) { + aligns.push(t.charCodeAt(0) === 58 /* : */ ? "center" : "right"); + } else if (t.charCodeAt(0) === 58 /* : */) { + aligns.push("left"); + } else { + aligns.push(""); + } + } + lineText = getLine(state, startLine).trim(); + if (lineText.indexOf("|") === -1) { + return false; + } + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + columns = escapedSplit(lineText); + if (columns.length && columns[0] === "") columns.shift(); + if (columns.length && columns[columns.length - 1] === "") columns.pop(); + // header row will define an amount of columns in the entire table, + // and align row should be exactly the same (the rest of the rows can differ) + columnCount = columns.length; + if (columnCount === 0 || columnCount !== aligns.length) { + return false; + } + if (silent) { + return true; + } + oldParentType = state.parentType; + state.parentType = "table"; + // use 'blockquote' lists for termination because it's + // the most similar to tables + terminatorRules = state.md.block.ruler.getRules("blockquote"); + token = state.push("table_open", "table", 1); + token.map = tableLines = [ startLine, 0 ]; + token = state.push("thead_open", "thead", 1); + token.map = [ startLine, startLine + 1 ]; + token = state.push("tr_open", "tr", 1); + token.map = [ startLine, startLine + 1 ]; + for (i = 0; i < columns.length; i++) { + token = state.push("th_open", "th", 1); + if (aligns[i]) { + token.attrs = [ [ "style", "text-align:" + aligns[i] ] ]; + } + token = state.push("inline", "", 0); + token.content = columns[i].trim(); + token.children = []; + token = state.push("th_close", "th", -1); + } + token = state.push("tr_close", "tr", -1); + token = state.push("thead_close", "thead", -1); + for (nextLine = startLine + 2; nextLine < endLine; nextLine++) { + if (state.sCount[nextLine] < state.blkIndent) { + break; + } + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + lineText = getLine(state, nextLine).trim(); + if (!lineText) { + break; + } + if (state.sCount[nextLine] - state.blkIndent >= 4) { + break; + } + columns = escapedSplit(lineText); + if (columns.length && columns[0] === "") columns.shift(); + if (columns.length && columns[columns.length - 1] === "") columns.pop(); + if (nextLine === startLine + 2) { + token = state.push("tbody_open", "tbody", 1); + token.map = tbodyLines = [ startLine + 2, 0 ]; + } + token = state.push("tr_open", "tr", 1); + token.map = [ nextLine, nextLine + 1 ]; + for (i = 0; i < columnCount; i++) { + token = state.push("td_open", "td", 1); + if (aligns[i]) { + token.attrs = [ [ "style", "text-align:" + aligns[i] ] ]; + } + token = state.push("inline", "", 0); + token.content = columns[i] ? columns[i].trim() : ""; + token.children = []; + token = state.push("td_close", "td", -1); + } + token = state.push("tr_close", "tr", -1); + } + if (tbodyLines) { + token = state.push("tbody_close", "tbody", -1); + tbodyLines[1] = nextLine; + } + token = state.push("table_close", "table", -1); + tableLines[1] = nextLine; + state.parentType = oldParentType; + state.line = nextLine; + return true; + }; + // Code block (4 spaces padded) + var code = function code(state, startLine, endLine /*, silent*/) { + var nextLine, last, token; + if (state.sCount[startLine] - state.blkIndent < 4) { + return false; + } + last = nextLine = startLine + 1; + while (nextLine < endLine) { + if (state.isEmpty(nextLine)) { + nextLine++; + continue; + } + if (state.sCount[nextLine] - state.blkIndent >= 4) { + nextLine++; + last = nextLine; + continue; + } + break; + } + state.line = last; + token = state.push("code_block", "code", 0); + token.content = state.getLines(startLine, last, 4 + state.blkIndent, false) + "\n"; + token.map = [ startLine, state.line ]; + return true; + }; + // fences (``` lang, ~~~ lang) + var fence = function fence(state, startLine, endLine, silent) { + var marker, len, params, nextLine, mem, token, markup, haveEndMarker = false, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + if (pos + 3 > max) { + return false; + } + marker = state.src.charCodeAt(pos); + if (marker !== 126 /* ~ */ && marker !== 96 /* ` */) { + return false; + } + // scan marker length + mem = pos; + pos = state.skipChars(pos, marker); + len = pos - mem; + if (len < 3) { + return false; + } + markup = state.src.slice(mem, pos); + params = state.src.slice(pos, max); + if (marker === 96 /* ` */) { + if (params.indexOf(String.fromCharCode(marker)) >= 0) { + return false; + } + } + // Since start is found, we can report success here in validation mode + if (silent) { + return true; + } + // search end of block + nextLine = startLine; + for (;;) { + nextLine++; + if (nextLine >= endLine) { + // unclosed block should be autoclosed by end of document. + // also block seems to be autoclosed by end of parent + break; + } + pos = mem = state.bMarks[nextLine] + state.tShift[nextLine]; + max = state.eMarks[nextLine]; + if (pos < max && state.sCount[nextLine] < state.blkIndent) { + // non-empty line with negative indent should stop the list: + // - ``` + // test + break; + } + if (state.src.charCodeAt(pos) !== marker) { + continue; + } + if (state.sCount[nextLine] - state.blkIndent >= 4) { + // closing fence should be indented less than 4 spaces + continue; + } + pos = state.skipChars(pos, marker); + // closing code fence must be at least as long as the opening one + if (pos - mem < len) { + continue; + } + // make sure tail has spaces only + pos = state.skipSpaces(pos); + if (pos < max) { + continue; + } + haveEndMarker = true; + // found! + break; + } + // If a fence has heading spaces, they should be removed from its inner block + len = state.sCount[startLine]; + state.line = nextLine + (haveEndMarker ? 1 : 0); + token = state.push("fence", "code", 0); + token.info = params; + token.content = state.getLines(startLine + 1, nextLine, len, true); + token.markup = markup; + token.map = [ startLine, state.line ]; + return true; + }; + var isSpace$9 = utils.isSpace; + var blockquote = function blockquote(state, startLine, endLine, silent) { + var adjustTab, ch, i, initial, l, lastLineEmpty, lines, nextLine, offset, oldBMarks, oldBSCount, oldIndent, oldParentType, oldSCount, oldTShift, spaceAfterMarker, terminate, terminatorRules, token, isOutdented, oldLineMax = state.lineMax, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + // check the block quote marker + if (state.src.charCodeAt(pos) !== 62 /* > */) { + return false; + } + // we know that it's going to be a valid blockquote, + // so no point trying to find the end of it in silent mode + if (silent) { + return true; + } + oldBMarks = []; + oldBSCount = []; + oldSCount = []; + oldTShift = []; + terminatorRules = state.md.block.ruler.getRules("blockquote"); + oldParentType = state.parentType; + state.parentType = "blockquote"; + // Search the end of the block + + // Block ends with either: + // 1. an empty line outside: + // ``` + // > test + + // ``` + // 2. an empty line inside: + // ``` + // > + // test + // ``` + // 3. another tag: + // ``` + // > test + // - - - + // ``` + for (nextLine = startLine; nextLine < endLine; nextLine++) { + // check if it's outdented, i.e. it's inside list item and indented + // less than said list item: + // ``` + // 1. anything + // > current blockquote + // 2. checking this line + // ``` + isOutdented = state.sCount[nextLine] < state.blkIndent; + pos = state.bMarks[nextLine] + state.tShift[nextLine]; + max = state.eMarks[nextLine]; + if (pos >= max) { + // Case 1: line is not inside the blockquote, and this line is empty. + break; + } + if (state.src.charCodeAt(pos++) === 62 /* > */ && !isOutdented) { + // This line is inside the blockquote. + // set offset past spaces and ">" + initial = state.sCount[nextLine] + 1; + // skip one optional space after '>' + if (state.src.charCodeAt(pos) === 32 /* space */) { + // ' > test ' + // ^ -- position start of line here: + pos++; + initial++; + adjustTab = false; + spaceAfterMarker = true; + } else if (state.src.charCodeAt(pos) === 9 /* tab */) { + spaceAfterMarker = true; + if ((state.bsCount[nextLine] + initial) % 4 === 3) { + // ' >\t test ' + // ^ -- position start of line here (tab has width===1) + pos++; + initial++; + adjustTab = false; + } else { + // ' >\t test ' + // ^ -- position start of line here + shift bsCount slightly + // to make extra space appear + adjustTab = true; + } + } else { + spaceAfterMarker = false; + } + offset = initial; + oldBMarks.push(state.bMarks[nextLine]); + state.bMarks[nextLine] = pos; + while (pos < max) { + ch = state.src.charCodeAt(pos); + if (isSpace$9(ch)) { + if (ch === 9) { + offset += 4 - (offset + state.bsCount[nextLine] + (adjustTab ? 1 : 0)) % 4; + } else { + offset++; + } + } else { + break; + } + pos++; + } + lastLineEmpty = pos >= max; + oldBSCount.push(state.bsCount[nextLine]); + state.bsCount[nextLine] = state.sCount[nextLine] + 1 + (spaceAfterMarker ? 1 : 0); + oldSCount.push(state.sCount[nextLine]); + state.sCount[nextLine] = offset - initial; + oldTShift.push(state.tShift[nextLine]); + state.tShift[nextLine] = pos - state.bMarks[nextLine]; + continue; + } + // Case 2: line is not inside the blockquote, and the last line was empty. + if (lastLineEmpty) { + break; + } + // Case 3: another tag found. + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + // Quirk to enforce "hard termination mode" for paragraphs; + // normally if you call `tokenize(state, startLine, nextLine)`, + // paragraphs will look below nextLine for paragraph continuation, + // but if blockquote is terminated by another tag, they shouldn't + state.lineMax = nextLine; + if (state.blkIndent !== 0) { + // state.blkIndent was non-zero, we now set it to zero, + // so we need to re-calculate all offsets to appear as + // if indent wasn't changed + oldBMarks.push(state.bMarks[nextLine]); + oldBSCount.push(state.bsCount[nextLine]); + oldTShift.push(state.tShift[nextLine]); + oldSCount.push(state.sCount[nextLine]); + state.sCount[nextLine] -= state.blkIndent; + } + break; + } + oldBMarks.push(state.bMarks[nextLine]); + oldBSCount.push(state.bsCount[nextLine]); + oldTShift.push(state.tShift[nextLine]); + oldSCount.push(state.sCount[nextLine]); + // A negative indentation means that this is a paragraph continuation + + state.sCount[nextLine] = -1; + } + oldIndent = state.blkIndent; + state.blkIndent = 0; + token = state.push("blockquote_open", "blockquote", 1); + token.markup = ">"; + token.map = lines = [ startLine, 0 ]; + state.md.block.tokenize(state, startLine, nextLine); + token = state.push("blockquote_close", "blockquote", -1); + token.markup = ">"; + state.lineMax = oldLineMax; + state.parentType = oldParentType; + lines[1] = state.line; + // Restore original tShift; this might not be necessary since the parser + // has already been here, but just to make sure we can do that. + for (i = 0; i < oldTShift.length; i++) { + state.bMarks[i + startLine] = oldBMarks[i]; + state.tShift[i + startLine] = oldTShift[i]; + state.sCount[i + startLine] = oldSCount[i]; + state.bsCount[i + startLine] = oldBSCount[i]; + } + state.blkIndent = oldIndent; + return true; + }; + var isSpace$8 = utils.isSpace; + var hr = function hr(state, startLine, endLine, silent) { + var marker, cnt, ch, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + marker = state.src.charCodeAt(pos++); + // Check hr marker + if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 95 /* _ */) { + return false; + } + // markers can be mixed with spaces, but there should be at least 3 of them + cnt = 1; + while (pos < max) { + ch = state.src.charCodeAt(pos++); + if (ch !== marker && !isSpace$8(ch)) { + return false; + } + if (ch === marker) { + cnt++; + } + } + if (cnt < 3) { + return false; + } + if (silent) { + return true; + } + state.line = startLine + 1; + token = state.push("hr", "hr", 0); + token.map = [ startLine, state.line ]; + token.markup = Array(cnt + 1).join(String.fromCharCode(marker)); + return true; + }; + var isSpace$7 = utils.isSpace; + // Search `[-+*][\n ]`, returns next pos after marker on success + // or -1 on fail. + function skipBulletListMarker(state, startLine) { + var marker, pos, max, ch; + pos = state.bMarks[startLine] + state.tShift[startLine]; + max = state.eMarks[startLine]; + marker = state.src.charCodeAt(pos++); + // Check bullet + if (marker !== 42 /* * */ && marker !== 45 /* - */ && marker !== 43 /* + */) { + return -1; + } + if (pos < max) { + ch = state.src.charCodeAt(pos); + if (!isSpace$7(ch)) { + // " -test " - is not a list item + return -1; + } + } + return pos; + } + // Search `\d+[.)][\n ]`, returns next pos after marker on success + // or -1 on fail. + function skipOrderedListMarker(state, startLine) { + var ch, start = state.bMarks[startLine] + state.tShift[startLine], pos = start, max = state.eMarks[startLine]; + // List marker should have at least 2 chars (digit + dot) + if (pos + 1 >= max) { + return -1; + } + ch = state.src.charCodeAt(pos++); + if (ch < 48 /* 0 */ || ch > 57 /* 9 */) { + return -1; + } + for (;;) { + // EOL -> fail + if (pos >= max) { + return -1; + } + ch = state.src.charCodeAt(pos++); + if (ch >= 48 /* 0 */ && ch <= 57 /* 9 */) { + // List marker should have no more than 9 digits + // (prevents integer overflow in browsers) + if (pos - start >= 10) { + return -1; + } + continue; + } + // found valid marker + if (ch === 41 /* ) */ || ch === 46 /* . */) { + break; + } + return -1; + } + if (pos < max) { + ch = state.src.charCodeAt(pos); + if (!isSpace$7(ch)) { + // " 1.test " - is not a list item + return -1; + } + } + return pos; + } + function markTightParagraphs(state, idx) { + var i, l, level = state.level + 2; + for (i = idx + 2, l = state.tokens.length - 2; i < l; i++) { + if (state.tokens[i].level === level && state.tokens[i].type === "paragraph_open") { + state.tokens[i + 2].hidden = true; + state.tokens[i].hidden = true; + i += 2; + } + } + } + var list = function list(state, startLine, endLine, silent) { + var ch, contentStart, i, indent, indentAfterMarker, initial, isOrdered, itemLines, l, listLines, listTokIdx, markerCharCode, markerValue, max, offset, oldListIndent, oldParentType, oldSCount, oldTShift, oldTight, pos, posAfterMarker, prevEmptyEnd, start, terminate, terminatorRules, token, nextLine = startLine, isTerminatingParagraph = false, tight = true; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[nextLine] - state.blkIndent >= 4) { + return false; + } + // Special case: + // - item 1 + // - item 2 + // - item 3 + // - item 4 + // - this one is a paragraph continuation + if (state.listIndent >= 0 && state.sCount[nextLine] - state.listIndent >= 4 && state.sCount[nextLine] < state.blkIndent) { + return false; + } + // limit conditions when list can interrupt + // a paragraph (validation mode only) + if (silent && state.parentType === "paragraph") { + // Next list item should still terminate previous list item; + // This code can fail if plugins use blkIndent as well as lists, + // but I hope the spec gets fixed long before that happens. + if (state.sCount[nextLine] >= state.blkIndent) { + isTerminatingParagraph = true; + } + } + // Detect list type and position after marker + if ((posAfterMarker = skipOrderedListMarker(state, nextLine)) >= 0) { + isOrdered = true; + start = state.bMarks[nextLine] + state.tShift[nextLine]; + markerValue = Number(state.src.slice(start, posAfterMarker - 1)); + // If we're starting a new ordered list right after + // a paragraph, it should start with 1. + if (isTerminatingParagraph && markerValue !== 1) return false; + } else if ((posAfterMarker = skipBulletListMarker(state, nextLine)) >= 0) { + isOrdered = false; + } else { + return false; + } + // If we're starting a new unordered list right after + // a paragraph, first line should not be empty. + if (isTerminatingParagraph) { + if (state.skipSpaces(posAfterMarker) >= state.eMarks[nextLine]) return false; + } + // For validation mode we can terminate immediately + if (silent) { + return true; + } + // We should terminate list on style change. Remember first one to compare. + markerCharCode = state.src.charCodeAt(posAfterMarker - 1); + // Start list + listTokIdx = state.tokens.length; + if (isOrdered) { + token = state.push("ordered_list_open", "ol", 1); + if (markerValue !== 1) { + token.attrs = [ [ "start", markerValue ] ]; + } + } else { + token = state.push("bullet_list_open", "ul", 1); + } + token.map = listLines = [ nextLine, 0 ]; + token.markup = String.fromCharCode(markerCharCode); + + // Iterate list items + + prevEmptyEnd = false; + terminatorRules = state.md.block.ruler.getRules("list"); + oldParentType = state.parentType; + state.parentType = "list"; + while (nextLine < endLine) { + pos = posAfterMarker; + max = state.eMarks[nextLine]; + initial = offset = state.sCount[nextLine] + posAfterMarker - (state.bMarks[nextLine] + state.tShift[nextLine]); + while (pos < max) { + ch = state.src.charCodeAt(pos); + if (ch === 9) { + offset += 4 - (offset + state.bsCount[nextLine]) % 4; + } else if (ch === 32) { + offset++; + } else { + break; + } + pos++; + } + contentStart = pos; + if (contentStart >= max) { + // trimming space in "- \n 3" case, indent is 1 here + indentAfterMarker = 1; + } else { + indentAfterMarker = offset - initial; + } + // If we have more than 4 spaces, the indent is 1 + // (the rest is just indented code block) + if (indentAfterMarker > 4) { + indentAfterMarker = 1; + } + // " - test" + // ^^^^^ - calculating total length of this thing + indent = initial + indentAfterMarker; + // Run subparser & write tokens + token = state.push("list_item_open", "li", 1); + token.markup = String.fromCharCode(markerCharCode); + token.map = itemLines = [ nextLine, 0 ]; + if (isOrdered) { + token.info = state.src.slice(start, posAfterMarker - 1); + } + // change current state, then restore it after parser subcall + oldTight = state.tight; + oldTShift = state.tShift[nextLine]; + oldSCount = state.sCount[nextLine]; + // - example list + // ^ listIndent position will be here + // ^ blkIndent position will be here + + oldListIndent = state.listIndent; + state.listIndent = state.blkIndent; + state.blkIndent = indent; + state.tight = true; + state.tShift[nextLine] = contentStart - state.bMarks[nextLine]; + state.sCount[nextLine] = offset; + if (contentStart >= max && state.isEmpty(nextLine + 1)) { + // workaround for this case + // (list item is empty, list terminates before "foo"): + // ~~~~~~~~ + // - + // foo + // ~~~~~~~~ + state.line = Math.min(state.line + 2, endLine); + } else { + state.md.block.tokenize(state, nextLine, endLine, true); + } + // If any of list item is tight, mark list as tight + if (!state.tight || prevEmptyEnd) { + tight = false; + } + // Item become loose if finish with empty line, + // but we should filter last element, because it means list finish + prevEmptyEnd = state.line - nextLine > 1 && state.isEmpty(state.line - 1); + state.blkIndent = state.listIndent; + state.listIndent = oldListIndent; + state.tShift[nextLine] = oldTShift; + state.sCount[nextLine] = oldSCount; + state.tight = oldTight; + token = state.push("list_item_close", "li", -1); + token.markup = String.fromCharCode(markerCharCode); + nextLine = state.line; + itemLines[1] = nextLine; + if (nextLine >= endLine) { + break; + } + + // Try to check if list is terminated or continued. + + if (state.sCount[nextLine] < state.blkIndent) { + break; + } + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[nextLine] - state.blkIndent >= 4) { + break; + } + // fail if terminating block found + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + // fail if list has another type + if (isOrdered) { + posAfterMarker = skipOrderedListMarker(state, nextLine); + if (posAfterMarker < 0) { + break; + } + start = state.bMarks[nextLine] + state.tShift[nextLine]; + } else { + posAfterMarker = skipBulletListMarker(state, nextLine); + if (posAfterMarker < 0) { + break; + } + } + if (markerCharCode !== state.src.charCodeAt(posAfterMarker - 1)) { + break; + } + } + // Finalize list + if (isOrdered) { + token = state.push("ordered_list_close", "ol", -1); + } else { + token = state.push("bullet_list_close", "ul", -1); + } + token.markup = String.fromCharCode(markerCharCode); + listLines[1] = nextLine; + state.line = nextLine; + state.parentType = oldParentType; + // mark paragraphs tight if needed + if (tight) { + markTightParagraphs(state, listTokIdx); + } + return true; + }; + var normalizeReference$2 = utils.normalizeReference; + var isSpace$6 = utils.isSpace; + var reference = function reference(state, startLine, _endLine, silent) { + var ch, destEndPos, destEndLineNo, endLine, href, i, l, label, labelEnd, oldParentType, res, start, str, terminate, terminatorRules, title, lines = 0, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine], nextLine = startLine + 1; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + if (state.src.charCodeAt(pos) !== 91 /* [ */) { + return false; + } + // Simple check to quickly interrupt scan on [link](url) at the start of line. + // Can be useful on practice: https://github.com/markdown-it/markdown-it/issues/54 + while (++pos < max) { + if (state.src.charCodeAt(pos) === 93 /* ] */ && state.src.charCodeAt(pos - 1) !== 92 /* \ */) { + if (pos + 1 === max) { + return false; + } + if (state.src.charCodeAt(pos + 1) !== 58 /* : */) { + return false; + } + break; + } + } + endLine = state.lineMax; + // jump line-by-line until empty one or EOF + terminatorRules = state.md.block.ruler.getRules("reference"); + oldParentType = state.parentType; + state.parentType = "reference"; + for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) { + // this would be a code block normally, but after paragraph + // it's considered a lazy continuation regardless of what's there + if (state.sCount[nextLine] - state.blkIndent > 3) { + continue; + } + // quirk for blockquotes, this line should already be checked by that rule + if (state.sCount[nextLine] < 0) { + continue; + } + // Some tags can terminate paragraph without empty line. + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + } + str = state.getLines(startLine, nextLine, state.blkIndent, false).trim(); + max = str.length; + for (pos = 1; pos < max; pos++) { + ch = str.charCodeAt(pos); + if (ch === 91 /* [ */) { + return false; + } else if (ch === 93 /* ] */) { + labelEnd = pos; + break; + } else if (ch === 10 /* \n */) { + lines++; + } else if (ch === 92 /* \ */) { + pos++; + if (pos < max && str.charCodeAt(pos) === 10) { + lines++; + } + } + } + if (labelEnd < 0 || str.charCodeAt(labelEnd + 1) !== 58 /* : */) { + return false; + } + // [label]: destination 'title' + // ^^^ skip optional whitespace here + for (pos = labelEnd + 2; pos < max; pos++) { + ch = str.charCodeAt(pos); + if (ch === 10) { + lines++; + } else if (isSpace$6(ch)) ; else { + break; + } + } + // [label]: destination 'title' + // ^^^^^^^^^^^ parse this + res = state.md.helpers.parseLinkDestination(str, pos, max); + if (!res.ok) { + return false; + } + href = state.md.normalizeLink(res.str); + if (!state.md.validateLink(href)) { + return false; + } + pos = res.pos; + lines += res.lines; + // save cursor state, we could require to rollback later + destEndPos = pos; + destEndLineNo = lines; + // [label]: destination 'title' + // ^^^ skipping those spaces + start = pos; + for (;pos < max; pos++) { + ch = str.charCodeAt(pos); + if (ch === 10) { + lines++; + } else if (isSpace$6(ch)) ; else { + break; + } + } + // [label]: destination 'title' + // ^^^^^^^ parse this + res = state.md.helpers.parseLinkTitle(str, pos, max); + if (pos < max && start !== pos && res.ok) { + title = res.str; + pos = res.pos; + lines += res.lines; + } else { + title = ""; + pos = destEndPos; + lines = destEndLineNo; + } + // skip trailing spaces until the rest of the line + while (pos < max) { + ch = str.charCodeAt(pos); + if (!isSpace$6(ch)) { + break; + } + pos++; + } + if (pos < max && str.charCodeAt(pos) !== 10) { + if (title) { + // garbage at the end of the line after title, + // but it could still be a valid reference if we roll back + title = ""; + pos = destEndPos; + lines = destEndLineNo; + while (pos < max) { + ch = str.charCodeAt(pos); + if (!isSpace$6(ch)) { + break; + } + pos++; + } + } + } + if (pos < max && str.charCodeAt(pos) !== 10) { + // garbage at the end of the line + return false; + } + label = normalizeReference$2(str.slice(1, labelEnd)); + if (!label) { + // CommonMark 0.20 disallows empty labels + return false; + } + // Reference can not terminate anything. This check is for safety only. + /*istanbul ignore if*/ if (silent) { + return true; + } + if (typeof state.env.references === "undefined") { + state.env.references = {}; + } + if (typeof state.env.references[label] === "undefined") { + state.env.references[label] = { + title: title, + href: href + }; + } + state.parentType = oldParentType; + state.line = startLine + lines + 1; + return true; + }; + // List of valid html blocks names, accorting to commonmark spec + var html_blocks = [ "address", "article", "aside", "base", "basefont", "blockquote", "body", "caption", "center", "col", "colgroup", "dd", "details", "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html", "iframe", "legend", "li", "link", "main", "menu", "menuitem", "nav", "noframes", "ol", "optgroup", "option", "p", "param", "section", "source", "summary", "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul" ]; + // Regexps to match html elements + var attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"; + var unquoted = "[^\"'=<>`\\x00-\\x20]+"; + var single_quoted = "'[^']*'"; + var double_quoted = '"[^"]*"'; + var attr_value = "(?:" + unquoted + "|" + single_quoted + "|" + double_quoted + ")"; + var attribute = "(?:\\s+" + attr_name + "(?:\\s*=\\s*" + attr_value + ")?)"; + var open_tag = "<[A-Za-z][A-Za-z0-9\\-]*" + attribute + "*\\s*\\/?>"; + var close_tag = "<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>"; + var comment = "\x3c!----\x3e|\x3c!--(?:-?[^>-])(?:-?[^-])*--\x3e"; + var processing = "<[?][\\s\\S]*?[?]>"; + var declaration = "]*>"; + var cdata = ""; + var HTML_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + "|" + comment + "|" + processing + "|" + declaration + "|" + cdata + ")"); + var HTML_OPEN_CLOSE_TAG_RE$1 = new RegExp("^(?:" + open_tag + "|" + close_tag + ")"); + var HTML_TAG_RE_1 = HTML_TAG_RE$1; + var HTML_OPEN_CLOSE_TAG_RE_1 = HTML_OPEN_CLOSE_TAG_RE$1; + var html_re = { + HTML_TAG_RE: HTML_TAG_RE_1, + HTML_OPEN_CLOSE_TAG_RE: HTML_OPEN_CLOSE_TAG_RE_1 + }; + var HTML_OPEN_CLOSE_TAG_RE = html_re.HTML_OPEN_CLOSE_TAG_RE; + // An array of opening and corresponding closing sequences for html tags, + // last argument defines whether it can terminate a paragraph or not + + var HTML_SEQUENCES = [ [ /^<(script|pre|style|textarea)(?=(\s|>|$))/i, /<\/(script|pre|style|textarea)>/i, true ], [ /^/, true ], [ /^<\?/, /\?>/, true ], [ /^/, true ], [ /^/, true ], [ new RegExp("^|$))", "i"), /^$/, true ], [ new RegExp(HTML_OPEN_CLOSE_TAG_RE.source + "\\s*$"), /^$/, false ] ]; + var html_block = function html_block(state, startLine, endLine, silent) { + var i, nextLine, token, lineText, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + if (!state.md.options.html) { + return false; + } + if (state.src.charCodeAt(pos) !== 60 /* < */) { + return false; + } + lineText = state.src.slice(pos, max); + for (i = 0; i < HTML_SEQUENCES.length; i++) { + if (HTML_SEQUENCES[i][0].test(lineText)) { + break; + } + } + if (i === HTML_SEQUENCES.length) { + return false; + } + if (silent) { + // true if this sequence can be a terminator, false otherwise + return HTML_SEQUENCES[i][2]; + } + nextLine = startLine + 1; + // If we are here - we detected HTML block. + // Let's roll down till block end. + if (!HTML_SEQUENCES[i][1].test(lineText)) { + for (;nextLine < endLine; nextLine++) { + if (state.sCount[nextLine] < state.blkIndent) { + break; + } + pos = state.bMarks[nextLine] + state.tShift[nextLine]; + max = state.eMarks[nextLine]; + lineText = state.src.slice(pos, max); + if (HTML_SEQUENCES[i][1].test(lineText)) { + if (lineText.length !== 0) { + nextLine++; + } + break; + } + } + } + state.line = nextLine; + token = state.push("html_block", "", 0); + token.map = [ startLine, nextLine ]; + token.content = state.getLines(startLine, nextLine, state.blkIndent, true); + return true; + }; + var isSpace$5 = utils.isSpace; + var heading = function heading(state, startLine, endLine, silent) { + var ch, level, tmp, token, pos = state.bMarks[startLine] + state.tShift[startLine], max = state.eMarks[startLine]; + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + ch = state.src.charCodeAt(pos); + if (ch !== 35 /* # */ || pos >= max) { + return false; + } + // count heading level + level = 1; + ch = state.src.charCodeAt(++pos); + while (ch === 35 /* # */ && pos < max && level <= 6) { + level++; + ch = state.src.charCodeAt(++pos); + } + if (level > 6 || pos < max && !isSpace$5(ch)) { + return false; + } + if (silent) { + return true; + } + // Let's cut tails like ' ### ' from the end of string + max = state.skipSpacesBack(max, pos); + tmp = state.skipCharsBack(max, 35, pos); + // # + if (tmp > pos && isSpace$5(state.src.charCodeAt(tmp - 1))) { + max = tmp; + } + state.line = startLine + 1; + token = state.push("heading_open", "h" + String(level), 1); + token.markup = "########".slice(0, level); + token.map = [ startLine, state.line ]; + token = state.push("inline", "", 0); + token.content = state.src.slice(pos, max).trim(); + token.map = [ startLine, state.line ]; + token.children = []; + token = state.push("heading_close", "h" + String(level), -1); + token.markup = "########".slice(0, level); + return true; + }; + // lheading (---, ===) + var lheading = function lheading(state, startLine, endLine /*, silent*/) { + var content, terminate, i, l, token, pos, max, level, marker, nextLine = startLine + 1, oldParentType, terminatorRules = state.md.block.ruler.getRules("paragraph"); + // if it's indented more than 3 spaces, it should be a code block + if (state.sCount[startLine] - state.blkIndent >= 4) { + return false; + } + oldParentType = state.parentType; + state.parentType = "paragraph"; + // use paragraph to match terminatorRules + // jump line-by-line until empty one or EOF + for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) { + // this would be a code block normally, but after paragraph + // it's considered a lazy continuation regardless of what's there + if (state.sCount[nextLine] - state.blkIndent > 3) { + continue; + } + + // Check for underline in setext header + + if (state.sCount[nextLine] >= state.blkIndent) { + pos = state.bMarks[nextLine] + state.tShift[nextLine]; + max = state.eMarks[nextLine]; + if (pos < max) { + marker = state.src.charCodeAt(pos); + if (marker === 45 /* - */ || marker === 61 /* = */) { + pos = state.skipChars(pos, marker); + pos = state.skipSpaces(pos); + if (pos >= max) { + level = marker === 61 /* = */ ? 1 : 2; + break; + } + } + } + } + // quirk for blockquotes, this line should already be checked by that rule + if (state.sCount[nextLine] < 0) { + continue; + } + // Some tags can terminate paragraph without empty line. + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + } + if (!level) { + // Didn't find valid underline + return false; + } + content = state.getLines(startLine, nextLine, state.blkIndent, false).trim(); + state.line = nextLine + 1; + token = state.push("heading_open", "h" + String(level), 1); + token.markup = String.fromCharCode(marker); + token.map = [ startLine, state.line ]; + token = state.push("inline", "", 0); + token.content = content; + token.map = [ startLine, state.line - 1 ]; + token.children = []; + token = state.push("heading_close", "h" + String(level), -1); + token.markup = String.fromCharCode(marker); + state.parentType = oldParentType; + return true; + }; + // Paragraph + var paragraph = function paragraph(state, startLine, endLine) { + var content, terminate, i, l, token, oldParentType, nextLine = startLine + 1, terminatorRules = state.md.block.ruler.getRules("paragraph"); + oldParentType = state.parentType; + state.parentType = "paragraph"; + // jump line-by-line until empty one or EOF + for (;nextLine < endLine && !state.isEmpty(nextLine); nextLine++) { + // this would be a code block normally, but after paragraph + // it's considered a lazy continuation regardless of what's there + if (state.sCount[nextLine] - state.blkIndent > 3) { + continue; + } + // quirk for blockquotes, this line should already be checked by that rule + if (state.sCount[nextLine] < 0) { + continue; + } + // Some tags can terminate paragraph without empty line. + terminate = false; + for (i = 0, l = terminatorRules.length; i < l; i++) { + if (terminatorRules[i](state, nextLine, endLine, true)) { + terminate = true; + break; + } + } + if (terminate) { + break; + } + } + content = state.getLines(startLine, nextLine, state.blkIndent, false).trim(); + state.line = nextLine; + token = state.push("paragraph_open", "p", 1); + token.map = [ startLine, state.line ]; + token = state.push("inline", "", 0); + token.content = content; + token.map = [ startLine, state.line ]; + token.children = []; + token = state.push("paragraph_close", "p", -1); + state.parentType = oldParentType; + return true; + }; + var isSpace$4 = utils.isSpace; + function StateBlock(src, md, env, tokens) { + var ch, s, start, pos, len, indent, offset, indent_found; + this.src = src; + // link to parser instance + this.md = md; + this.env = env; + + // Internal state vartiables + + this.tokens = tokens; + this.bMarks = []; + // line begin offsets for fast jumps + this.eMarks = []; + // line end offsets for fast jumps + this.tShift = []; + // offsets of the first non-space characters (tabs not expanded) + this.sCount = []; + // indents for each line (tabs expanded) + // An amount of virtual spaces (tabs expanded) between beginning + // of each line (bMarks) and real beginning of that line. + + // It exists only as a hack because blockquotes override bMarks + // losing information in the process. + + // It's used only when expanding tabs, you can think about it as + // an initial tab length, e.g. bsCount=21 applied to string `\t123` + // means first tab should be expanded to 4-21%4 === 3 spaces. + + this.bsCount = []; + // block parser variables + this.blkIndent = 0; + // required block content indent (for example, if we are + // inside a list, it would be positioned after list marker) + this.line = 0; + // line index in src + this.lineMax = 0; + // lines count + this.tight = false; + // loose/tight mode for lists + this.ddIndent = -1; + // indent of the current dd block (-1 if there isn't any) + this.listIndent = -1; + // indent of the current list block (-1 if there isn't any) + // can be 'blockquote', 'list', 'root', 'paragraph' or 'reference' + // used in lists to determine if they interrupt a paragraph + this.parentType = "root"; + this.level = 0; + // renderer + this.result = ""; + // Create caches + // Generate markers. + s = this.src; + indent_found = false; + for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) { + ch = s.charCodeAt(pos); + if (!indent_found) { + if (isSpace$4(ch)) { + indent++; + if (ch === 9) { + offset += 4 - offset % 4; + } else { + offset++; + } + continue; + } else { + indent_found = true; + } + } + if (ch === 10 || pos === len - 1) { + if (ch !== 10) { + pos++; + } + this.bMarks.push(start); + this.eMarks.push(pos); + this.tShift.push(indent); + this.sCount.push(offset); + this.bsCount.push(0); + indent_found = false; + indent = 0; + offset = 0; + start = pos + 1; + } + } + // Push fake entry to simplify cache bounds checks + this.bMarks.push(s.length); + this.eMarks.push(s.length); + this.tShift.push(0); + this.sCount.push(0); + this.bsCount.push(0); + this.lineMax = this.bMarks.length - 1; + // don't count last fake line + } + // Push new token to "stream". + + StateBlock.prototype.push = function(type, tag, nesting) { + var token$1 = new token(type, tag, nesting); + token$1.block = true; + if (nesting < 0) this.level--; + // closing tag + token$1.level = this.level; + if (nesting > 0) this.level++; + // opening tag + this.tokens.push(token$1); + return token$1; + }; + StateBlock.prototype.isEmpty = function isEmpty(line) { + return this.bMarks[line] + this.tShift[line] >= this.eMarks[line]; + }; + StateBlock.prototype.skipEmptyLines = function skipEmptyLines(from) { + for (var max = this.lineMax; from < max; from++) { + if (this.bMarks[from] + this.tShift[from] < this.eMarks[from]) { + break; + } + } + return from; + }; + // Skip spaces from given position. + StateBlock.prototype.skipSpaces = function skipSpaces(pos) { + var ch; + for (var max = this.src.length; pos < max; pos++) { + ch = this.src.charCodeAt(pos); + if (!isSpace$4(ch)) { + break; + } + } + return pos; + }; + // Skip spaces from given position in reverse. + StateBlock.prototype.skipSpacesBack = function skipSpacesBack(pos, min) { + if (pos <= min) { + return pos; + } + while (pos > min) { + if (!isSpace$4(this.src.charCodeAt(--pos))) { + return pos + 1; + } + } + return pos; + }; + // Skip char codes from given position + StateBlock.prototype.skipChars = function skipChars(pos, code) { + for (var max = this.src.length; pos < max; pos++) { + if (this.src.charCodeAt(pos) !== code) { + break; + } + } + return pos; + }; + // Skip char codes reverse from given position - 1 + StateBlock.prototype.skipCharsBack = function skipCharsBack(pos, code, min) { + if (pos <= min) { + return pos; + } + while (pos > min) { + if (code !== this.src.charCodeAt(--pos)) { + return pos + 1; + } + } + return pos; + }; + // cut lines range from source. + StateBlock.prototype.getLines = function getLines(begin, end, indent, keepLastLF) { + var i, lineIndent, ch, first, last, queue, lineStart, line = begin; + if (begin >= end) { + return ""; + } + queue = new Array(end - begin); + for (i = 0; line < end; line++, i++) { + lineIndent = 0; + lineStart = first = this.bMarks[line]; + if (line + 1 < end || keepLastLF) { + // No need for bounds check because we have fake entry on tail. + last = this.eMarks[line] + 1; + } else { + last = this.eMarks[line]; + } + while (first < last && lineIndent < indent) { + ch = this.src.charCodeAt(first); + if (isSpace$4(ch)) { + if (ch === 9) { + lineIndent += 4 - (lineIndent + this.bsCount[line]) % 4; + } else { + lineIndent++; + } + } else if (first - lineStart < this.tShift[line]) { + // patched tShift masked characters to look like spaces (blockquotes, list markers) + lineIndent++; + } else { + break; + } + first++; + } + if (lineIndent > indent) { + // partially expanding tabs in code blocks, e.g '\t\tfoobar' + // with indent=2 becomes ' \tfoobar' + queue[i] = new Array(lineIndent - indent + 1).join(" ") + this.src.slice(first, last); + } else { + queue[i] = this.src.slice(first, last); + } + } + return queue.join(""); + }; + // re-export Token class to use in block rules + StateBlock.prototype.Token = token; + var state_block = StateBlock; + var _rules$1 = [ + // First 2 params - rule name & source. Secondary array - list of rules, + // which can be terminated by this one. + [ "table", table, [ "paragraph", "reference" ] ], [ "code", code ], [ "fence", fence, [ "paragraph", "reference", "blockquote", "list" ] ], [ "blockquote", blockquote, [ "paragraph", "reference", "blockquote", "list" ] ], [ "hr", hr, [ "paragraph", "reference", "blockquote", "list" ] ], [ "list", list, [ "paragraph", "reference", "blockquote" ] ], [ "reference", reference ], [ "html_block", html_block, [ "paragraph", "reference", "blockquote" ] ], [ "heading", heading, [ "paragraph", "reference", "blockquote" ] ], [ "lheading", lheading ], [ "paragraph", paragraph ] ]; + /** + * new ParserBlock() + **/ function ParserBlock() { + /** + * ParserBlock#ruler -> Ruler + * + * [[Ruler]] instance. Keep configuration of block rules. + **/ + this.ruler = new ruler; + for (var i = 0; i < _rules$1.length; i++) { + this.ruler.push(_rules$1[i][0], _rules$1[i][1], { + alt: (_rules$1[i][2] || []).slice() + }); + } + } + // Generate tokens for input range + + ParserBlock.prototype.tokenize = function(state, startLine, endLine) { + var ok, i, prevLine, rules = this.ruler.getRules(""), len = rules.length, line = startLine, hasEmptyLines = false, maxNesting = state.md.options.maxNesting; + while (line < endLine) { + state.line = line = state.skipEmptyLines(line); + if (line >= endLine) { + break; + } + // Termination condition for nested calls. + // Nested calls currently used for blockquotes & lists + if (state.sCount[line] < state.blkIndent) { + break; + } + // If nesting level exceeded - skip tail to the end. That's not ordinary + // situation and we should not care about content. + if (state.level >= maxNesting) { + state.line = endLine; + break; + } + // Try all possible rules. + // On success, rule should: + + // - update `state.line` + // - update `state.tokens` + // - return true + prevLine = state.line; + for (i = 0; i < len; i++) { + ok = rules[i](state, line, endLine, false); + if (ok) { + if (prevLine >= state.line) { + throw new Error("block rule didn't increment state.line"); + } + break; + } + } + // this can only happen if user disables paragraph rule + if (!ok) throw new Error("none of the block rules matched"); + // set state.tight if we had an empty line before current tag + // i.e. latest empty line should not count + state.tight = !hasEmptyLines; + // paragraph might "eat" one newline after it in nested lists + if (state.isEmpty(state.line - 1)) { + hasEmptyLines = true; + } + line = state.line; + if (line < endLine && state.isEmpty(line)) { + hasEmptyLines = true; + line++; + state.line = line; + } + } + }; + /** + * ParserBlock.parse(str, md, env, outTokens) + * + * Process input string and push block tokens into `outTokens` + **/ ParserBlock.prototype.parse = function(src, md, env, outTokens) { + var state; + if (!src) { + return; + } + state = new this.State(src, md, env, outTokens); + this.tokenize(state, state.line, state.lineMax); + }; + ParserBlock.prototype.State = state_block; + var parser_block = ParserBlock; + // Skip text characters for text token, place those to pending buffer + // Rule to skip pure text + // '{}$%@~+=:' reserved for extentions + // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~ + // !!!! Don't confuse with "Markdown ASCII Punctuation" chars + // http://spec.commonmark.org/0.15/#ascii-punctuation-character + function isTerminatorChar(ch) { + switch (ch) { + case 10 /* \n */ : + case 33 /* ! */ : + case 35 /* # */ : + case 36 /* $ */ : + case 37 /* % */ : + case 38 /* & */ : + case 42 /* * */ : + case 43 /* + */ : + case 45 /* - */ : + case 58 /* : */ : + case 60 /* < */ : + case 61 /* = */ : + case 62 /* > */ : + case 64 /* @ */ : + case 91 /* [ */ : + case 92 /* \ */ : + case 93 /* ] */ : + case 94 /* ^ */ : + case 95 /* _ */ : + case 96 /* ` */ : + case 123 /* { */ : + case 125 /* } */ : + case 126 /* ~ */ : + return true; + + default: + return false; + } + } + var text = function text(state, silent) { + var pos = state.pos; + while (pos < state.posMax && !isTerminatorChar(state.src.charCodeAt(pos))) { + pos++; + } + if (pos === state.pos) { + return false; + } + if (!silent) { + state.pending += state.src.slice(state.pos, pos); + } + state.pos = pos; + return true; + }; + // Process links like https://example.org/ + // RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + var SCHEME_RE = /(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$/i; + var linkify = function linkify(state, silent) { + var pos, max, match, proto, link, url, fullUrl, token; + if (!state.md.options.linkify) return false; + if (state.linkLevel > 0) return false; + pos = state.pos; + max = state.posMax; + if (pos + 3 > max) return false; + if (state.src.charCodeAt(pos) !== 58 /* : */) return false; + if (state.src.charCodeAt(pos + 1) !== 47 /* / */) return false; + if (state.src.charCodeAt(pos + 2) !== 47 /* / */) return false; + match = state.pending.match(SCHEME_RE); + if (!match) return false; + proto = match[1]; + link = state.md.linkify.matchAtStart(state.src.slice(pos - proto.length)); + if (!link) return false; + url = link.url; + // invalid link, but still detected by linkify somehow; + // need to check to prevent infinite loop below + if (url.length <= proto.length) return false; + // disallow '*' at the end of the link (conflicts with emphasis) + url = url.replace(/\*+$/, ""); + fullUrl = state.md.normalizeLink(url); + if (!state.md.validateLink(fullUrl)) return false; + if (!silent) { + state.pending = state.pending.slice(0, -proto.length); + token = state.push("link_open", "a", 1); + token.attrs = [ [ "href", fullUrl ] ]; + token.markup = "linkify"; + token.info = "auto"; + token = state.push("text", "", 0); + token.content = state.md.normalizeLinkText(url); + token = state.push("link_close", "a", -1); + token.markup = "linkify"; + token.info = "auto"; + } + state.pos += url.length - proto.length; + return true; + }; + var isSpace$3 = utils.isSpace; + var newline = function newline(state, silent) { + var pmax, max, ws, pos = state.pos; + if (state.src.charCodeAt(pos) !== 10 /* \n */) { + return false; + } + pmax = state.pending.length - 1; + max = state.posMax; + // ' \n' -> hardbreak + // Lookup in pending chars is bad practice! Don't copy to other rules! + // Pending string is stored in concat mode, indexed lookups will cause + // convertion to flat mode. + if (!silent) { + if (pmax >= 0 && state.pending.charCodeAt(pmax) === 32) { + if (pmax >= 1 && state.pending.charCodeAt(pmax - 1) === 32) { + // Find whitespaces tail of pending chars. + ws = pmax - 1; + while (ws >= 1 && state.pending.charCodeAt(ws - 1) === 32) ws--; + state.pending = state.pending.slice(0, ws); + state.push("hardbreak", "br", 0); + } else { + state.pending = state.pending.slice(0, -1); + state.push("softbreak", "br", 0); + } + } else { + state.push("softbreak", "br", 0); + } + } + pos++; + // skip heading spaces for next line + while (pos < max && isSpace$3(state.src.charCodeAt(pos))) { + pos++; + } + state.pos = pos; + return true; + }; + var isSpace$2 = utils.isSpace; + var ESCAPED = []; + for (var i = 0; i < 256; i++) { + ESCAPED.push(0); + } + "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-".split("").forEach((function(ch) { + ESCAPED[ch.charCodeAt(0)] = 1; + })); + var _escape = function escape(state, silent) { + var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax; + if (state.src.charCodeAt(pos) !== 92 /* \ */) return false; + pos++; + // '\' at the end of the inline block + if (pos >= max) return false; + ch1 = state.src.charCodeAt(pos); + if (ch1 === 10) { + if (!silent) { + state.push("hardbreak", "br", 0); + } + pos++; + // skip leading whitespaces from next line + while (pos < max) { + ch1 = state.src.charCodeAt(pos); + if (!isSpace$2(ch1)) break; + pos++; + } + state.pos = pos; + return true; + } + escapedStr = state.src[pos]; + if (ch1 >= 55296 && ch1 <= 56319 && pos + 1 < max) { + ch2 = state.src.charCodeAt(pos + 1); + if (ch2 >= 56320 && ch2 <= 57343) { + escapedStr += state.src[pos + 1]; + pos++; + } + } + origStr = "\\" + escapedStr; + if (!silent) { + token = state.push("text_special", "", 0); + if (ch1 < 256 && ESCAPED[ch1] !== 0) { + token.content = escapedStr; + } else { + token.content = origStr; + } + token.markup = origStr; + token.info = "escape"; + } + state.pos = pos + 1; + return true; + }; + // Parse backticks + var backticks = function backtick(state, silent) { + var start, max, marker, token, matchStart, matchEnd, openerLength, closerLength, pos = state.pos, ch = state.src.charCodeAt(pos); + if (ch !== 96 /* ` */) { + return false; + } + start = pos; + pos++; + max = state.posMax; + // scan marker length + while (pos < max && state.src.charCodeAt(pos) === 96 /* ` */) { + pos++; + } + marker = state.src.slice(start, pos); + openerLength = marker.length; + if (state.backticksScanned && (state.backticks[openerLength] || 0) <= start) { + if (!silent) state.pending += marker; + state.pos += openerLength; + return true; + } + matchEnd = pos; + // Nothing found in the cache, scan until the end of the line (or until marker is found) + while ((matchStart = state.src.indexOf("`", matchEnd)) !== -1) { + matchEnd = matchStart + 1; + // scan marker length + while (matchEnd < max && state.src.charCodeAt(matchEnd) === 96 /* ` */) { + matchEnd++; + } + closerLength = matchEnd - matchStart; + if (closerLength === openerLength) { + // Found matching closer length. + if (!silent) { + token = state.push("code_inline", "code", 0); + token.markup = marker; + token.content = state.src.slice(pos, matchStart).replace(/\n/g, " ").replace(/^ (.+) $/, "$1"); + } + state.pos = matchEnd; + return true; + } + // Some different length found, put it in cache as upper limit of where closer can be found + state.backticks[closerLength] = matchStart; + } + // Scanned through the end, didn't find anything + state.backticksScanned = true; + if (!silent) state.pending += marker; + state.pos += openerLength; + return true; + }; + // ~~strike through~~ + // Insert each marker as a separate text token, and add it to delimiter list + + var tokenize$1 = function strikethrough(state, silent) { + var i, scanned, token, len, ch, start = state.pos, marker = state.src.charCodeAt(start); + if (silent) { + return false; + } + if (marker !== 126 /* ~ */) { + return false; + } + scanned = state.scanDelims(state.pos, true); + len = scanned.length; + ch = String.fromCharCode(marker); + if (len < 2) { + return false; + } + if (len % 2) { + token = state.push("text", "", 0); + token.content = ch; + len--; + } + for (i = 0; i < len; i += 2) { + token = state.push("text", "", 0); + token.content = ch + ch; + state.delimiters.push({ + marker: marker, + length: 0, + // disable "rule of 3" length checks meant for emphasis + token: state.tokens.length - 1, + end: -1, + open: scanned.can_open, + close: scanned.can_close + }); + } + state.pos += scanned.length; + return true; + }; + function postProcess$1(state, delimiters) { + var i, j, startDelim, endDelim, token, loneMarkers = [], max = delimiters.length; + for (i = 0; i < max; i++) { + startDelim = delimiters[i]; + if (startDelim.marker !== 126 /* ~ */) { + continue; + } + if (startDelim.end === -1) { + continue; + } + endDelim = delimiters[startDelim.end]; + token = state.tokens[startDelim.token]; + token.type = "s_open"; + token.tag = "s"; + token.nesting = 1; + token.markup = "~~"; + token.content = ""; + token = state.tokens[endDelim.token]; + token.type = "s_close"; + token.tag = "s"; + token.nesting = -1; + token.markup = "~~"; + token.content = ""; + if (state.tokens[endDelim.token - 1].type === "text" && state.tokens[endDelim.token - 1].content === "~") { + loneMarkers.push(endDelim.token - 1); + } + } + // If a marker sequence has an odd number of characters, it's splitted + // like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the + // start of the sequence. + + // So, we have to move all those markers after subsequent s_close tags. + + while (loneMarkers.length) { + i = loneMarkers.pop(); + j = i + 1; + while (j < state.tokens.length && state.tokens[j].type === "s_close") { + j++; + } + j--; + if (i !== j) { + token = state.tokens[j]; + state.tokens[j] = state.tokens[i]; + state.tokens[i] = token; + } + } + } + // Walk through delimiter list and replace text tokens with tags + + var postProcess_1$1 = function strikethrough(state) { + var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length; + postProcess$1(state, state.delimiters); + for (curr = 0; curr < max; curr++) { + if (tokens_meta[curr] && tokens_meta[curr].delimiters) { + postProcess$1(state, tokens_meta[curr].delimiters); + } + } + }; + var strikethrough = { + tokenize: tokenize$1, + postProcess: postProcess_1$1 + }; + // Process *this* and _that_ + // Insert each marker as a separate text token, and add it to delimiter list + + var tokenize = function emphasis(state, silent) { + var i, scanned, token, start = state.pos, marker = state.src.charCodeAt(start); + if (silent) { + return false; + } + if (marker !== 95 /* _ */ && marker !== 42 /* * */) { + return false; + } + scanned = state.scanDelims(state.pos, marker === 42); + for (i = 0; i < scanned.length; i++) { + token = state.push("text", "", 0); + token.content = String.fromCharCode(marker); + state.delimiters.push({ + // Char code of the starting marker (number). + marker: marker, + // Total length of these series of delimiters. + length: scanned.length, + // A position of the token this delimiter corresponds to. + token: state.tokens.length - 1, + // If this delimiter is matched as a valid opener, `end` will be + // equal to its position, otherwise it's `-1`. + end: -1, + // Boolean flags that determine if this delimiter could open or close + // an emphasis. + open: scanned.can_open, + close: scanned.can_close + }); + } + state.pos += scanned.length; + return true; + }; + function postProcess(state, delimiters) { + var i, startDelim, endDelim, token, ch, isStrong, max = delimiters.length; + for (i = max - 1; i >= 0; i--) { + startDelim = delimiters[i]; + if (startDelim.marker !== 95 /* _ */ && startDelim.marker !== 42 /* * */) { + continue; + } + // Process only opening markers + if (startDelim.end === -1) { + continue; + } + endDelim = delimiters[startDelim.end]; + // If the previous delimiter has the same marker and is adjacent to this one, + // merge those into one strong delimiter. + + // `whatever` -> `whatever` + + isStrong = i > 0 && delimiters[i - 1].end === startDelim.end + 1 && + // check that first two markers match and adjacent + delimiters[i - 1].marker === startDelim.marker && delimiters[i - 1].token === startDelim.token - 1 && + // check that last two markers are adjacent (we can safely assume they match) + delimiters[startDelim.end + 1].token === endDelim.token + 1; + ch = String.fromCharCode(startDelim.marker); + token = state.tokens[startDelim.token]; + token.type = isStrong ? "strong_open" : "em_open"; + token.tag = isStrong ? "strong" : "em"; + token.nesting = 1; + token.markup = isStrong ? ch + ch : ch; + token.content = ""; + token = state.tokens[endDelim.token]; + token.type = isStrong ? "strong_close" : "em_close"; + token.tag = isStrong ? "strong" : "em"; + token.nesting = -1; + token.markup = isStrong ? ch + ch : ch; + token.content = ""; + if (isStrong) { + state.tokens[delimiters[i - 1].token].content = ""; + state.tokens[delimiters[startDelim.end + 1].token].content = ""; + i--; + } + } + } + // Walk through delimiter list and replace text tokens with tags + + var postProcess_1 = function emphasis(state) { + var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length; + postProcess(state, state.delimiters); + for (curr = 0; curr < max; curr++) { + if (tokens_meta[curr] && tokens_meta[curr].delimiters) { + postProcess(state, tokens_meta[curr].delimiters); + } + } + }; + var emphasis = { + tokenize: tokenize, + postProcess: postProcess_1 + }; + var normalizeReference$1 = utils.normalizeReference; + var isSpace$1 = utils.isSpace; + var link = function link(state, silent) { + var attrs, code, label, labelEnd, labelStart, pos, res, ref, token, href = "", title = "", oldPos = state.pos, max = state.posMax, start = state.pos, parseReference = true; + if (state.src.charCodeAt(state.pos) !== 91 /* [ */) { + return false; + } + labelStart = state.pos + 1; + labelEnd = state.md.helpers.parseLinkLabel(state, state.pos, true); + // parser failed to find ']', so it's not a valid link + if (labelEnd < 0) { + return false; + } + pos = labelEnd + 1; + if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) { + // Inline link + // might have found a valid shortcut link, disable reference parsing + parseReference = false; + // [link]( "title" ) + // ^^ skipping these spaces + pos++; + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace$1(code) && code !== 10) { + break; + } + } + if (pos >= max) { + return false; + } + // [link]( "title" ) + // ^^^^^^ parsing link destination + start = pos; + res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax); + if (res.ok) { + href = state.md.normalizeLink(res.str); + if (state.md.validateLink(href)) { + pos = res.pos; + } else { + href = ""; + } + // [link]( "title" ) + // ^^ skipping these spaces + start = pos; + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace$1(code) && code !== 10) { + break; + } + } + // [link]( "title" ) + // ^^^^^^^ parsing link title + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax); + if (pos < max && start !== pos && res.ok) { + title = res.str; + pos = res.pos; + // [link]( "title" ) + // ^^ skipping these spaces + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace$1(code) && code !== 10) { + break; + } + } + } + } + if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) { + // parsing a valid shortcut link failed, fallback to reference + parseReference = true; + } + pos++; + } + if (parseReference) { + // Link reference + if (typeof state.env.references === "undefined") { + return false; + } + if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) { + start = pos + 1; + pos = state.md.helpers.parseLinkLabel(state, pos); + if (pos >= 0) { + label = state.src.slice(start, pos++); + } else { + pos = labelEnd + 1; + } + } else { + pos = labelEnd + 1; + } + // covers label === '' and label === undefined + // (collapsed reference link and shortcut reference link respectively) + if (!label) { + label = state.src.slice(labelStart, labelEnd); + } + ref = state.env.references[normalizeReference$1(label)]; + if (!ref) { + state.pos = oldPos; + return false; + } + href = ref.href; + title = ref.title; + } + + // We found the end of the link, and know for a fact it's a valid link; + // so all that's left to do is to call tokenizer. + + if (!silent) { + state.pos = labelStart; + state.posMax = labelEnd; + token = state.push("link_open", "a", 1); + token.attrs = attrs = [ [ "href", href ] ]; + if (title) { + attrs.push([ "title", title ]); + } + state.linkLevel++; + state.md.inline.tokenize(state); + state.linkLevel--; + token = state.push("link_close", "a", -1); + } + state.pos = pos; + state.posMax = max; + return true; + }; + var normalizeReference = utils.normalizeReference; + var isSpace = utils.isSpace; + var image = function image(state, silent) { + var attrs, code, content, label, labelEnd, labelStart, pos, ref, res, title, token, tokens, start, href = "", oldPos = state.pos, max = state.posMax; + if (state.src.charCodeAt(state.pos) !== 33 /* ! */) { + return false; + } + if (state.src.charCodeAt(state.pos + 1) !== 91 /* [ */) { + return false; + } + labelStart = state.pos + 2; + labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, false); + // parser failed to find ']', so it's not a valid link + if (labelEnd < 0) { + return false; + } + pos = labelEnd + 1; + if (pos < max && state.src.charCodeAt(pos) === 40 /* ( */) { + // Inline link + // [link]( "title" ) + // ^^ skipping these spaces + pos++; + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace(code) && code !== 10) { + break; + } + } + if (pos >= max) { + return false; + } + // [link]( "title" ) + // ^^^^^^ parsing link destination + start = pos; + res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax); + if (res.ok) { + href = state.md.normalizeLink(res.str); + if (state.md.validateLink(href)) { + pos = res.pos; + } else { + href = ""; + } + } + // [link]( "title" ) + // ^^ skipping these spaces + start = pos; + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace(code) && code !== 10) { + break; + } + } + // [link]( "title" ) + // ^^^^^^^ parsing link title + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax); + if (pos < max && start !== pos && res.ok) { + title = res.str; + pos = res.pos; + // [link]( "title" ) + // ^^ skipping these spaces + for (;pos < max; pos++) { + code = state.src.charCodeAt(pos); + if (!isSpace(code) && code !== 10) { + break; + } + } + } else { + title = ""; + } + if (pos >= max || state.src.charCodeAt(pos) !== 41 /* ) */) { + state.pos = oldPos; + return false; + } + pos++; + } else { + // Link reference + if (typeof state.env.references === "undefined") { + return false; + } + if (pos < max && state.src.charCodeAt(pos) === 91 /* [ */) { + start = pos + 1; + pos = state.md.helpers.parseLinkLabel(state, pos); + if (pos >= 0) { + label = state.src.slice(start, pos++); + } else { + pos = labelEnd + 1; + } + } else { + pos = labelEnd + 1; + } + // covers label === '' and label === undefined + // (collapsed reference link and shortcut reference link respectively) + if (!label) { + label = state.src.slice(labelStart, labelEnd); + } + ref = state.env.references[normalizeReference(label)]; + if (!ref) { + state.pos = oldPos; + return false; + } + href = ref.href; + title = ref.title; + } + + // We found the end of the link, and know for a fact it's a valid link; + // so all that's left to do is to call tokenizer. + + if (!silent) { + content = state.src.slice(labelStart, labelEnd); + state.md.inline.parse(content, state.md, state.env, tokens = []); + token = state.push("image", "img", 0); + token.attrs = attrs = [ [ "src", href ], [ "alt", "" ] ]; + token.children = tokens; + token.content = content; + if (title) { + attrs.push([ "title", title ]); + } + } + state.pos = pos; + state.posMax = max; + return true; + }; + // Process autolinks '' + /*eslint max-len:0*/ var EMAIL_RE = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$/; + var AUTOLINK_RE = /^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$/; + var autolink = function autolink(state, silent) { + var url, fullUrl, token, ch, start, max, pos = state.pos; + if (state.src.charCodeAt(pos) !== 60 /* < */) { + return false; + } + start = state.pos; + max = state.posMax; + for (;;) { + if (++pos >= max) return false; + ch = state.src.charCodeAt(pos); + if (ch === 60 /* < */) return false; + if (ch === 62 /* > */) break; + } + url = state.src.slice(start + 1, pos); + if (AUTOLINK_RE.test(url)) { + fullUrl = state.md.normalizeLink(url); + if (!state.md.validateLink(fullUrl)) { + return false; + } + if (!silent) { + token = state.push("link_open", "a", 1); + token.attrs = [ [ "href", fullUrl ] ]; + token.markup = "autolink"; + token.info = "auto"; + token = state.push("text", "", 0); + token.content = state.md.normalizeLinkText(url); + token = state.push("link_close", "a", -1); + token.markup = "autolink"; + token.info = "auto"; + } + state.pos += url.length + 2; + return true; + } + if (EMAIL_RE.test(url)) { + fullUrl = state.md.normalizeLink("mailto:" + url); + if (!state.md.validateLink(fullUrl)) { + return false; + } + if (!silent) { + token = state.push("link_open", "a", 1); + token.attrs = [ [ "href", fullUrl ] ]; + token.markup = "autolink"; + token.info = "auto"; + token = state.push("text", "", 0); + token.content = state.md.normalizeLinkText(url); + token = state.push("link_close", "a", -1); + token.markup = "autolink"; + token.info = "auto"; + } + state.pos += url.length + 2; + return true; + } + return false; + }; + var HTML_TAG_RE = html_re.HTML_TAG_RE; + function isLinkOpen(str) { + return /^\s]/i.test(str); + } + function isLinkClose(str) { + return /^<\/a\s*>/i.test(str); + } + function isLetter(ch) { + /*eslint no-bitwise:0*/ + var lc = ch | 32; + // to lower case + return lc >= 97 /* a */ && lc <= 122 /* z */; + } + var html_inline = function html_inline(state, silent) { + var ch, match, max, token, pos = state.pos; + if (!state.md.options.html) { + return false; + } + // Check start + max = state.posMax; + if (state.src.charCodeAt(pos) !== 60 /* < */ || pos + 2 >= max) { + return false; + } + // Quick fail on second char + ch = state.src.charCodeAt(pos + 1); + if (ch !== 33 /* ! */ && ch !== 63 /* ? */ && ch !== 47 /* / */ && !isLetter(ch)) { + return false; + } + match = state.src.slice(pos).match(HTML_TAG_RE); + if (!match) { + return false; + } + if (!silent) { + token = state.push("html_inline", "", 0); + token.content = match[0]; + if (isLinkOpen(token.content)) state.linkLevel++; + if (isLinkClose(token.content)) state.linkLevel--; + } + state.pos += match[0].length; + return true; + }; + var has = utils.has; + var isValidEntityCode = utils.isValidEntityCode; + var fromCodePoint = utils.fromCodePoint; + var DIGITAL_RE = /^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));/i; + var NAMED_RE = /^&([a-z][a-z0-9]{1,31});/i; + var entity = function entity(state, silent) { + var ch, code, match, token, pos = state.pos, max = state.posMax; + if (state.src.charCodeAt(pos) !== 38 /* & */) return false; + if (pos + 1 >= max) return false; + ch = state.src.charCodeAt(pos + 1); + if (ch === 35 /* # */) { + match = state.src.slice(pos).match(DIGITAL_RE); + if (match) { + if (!silent) { + code = match[1][0].toLowerCase() === "x" ? parseInt(match[1].slice(1), 16) : parseInt(match[1], 10); + token = state.push("text_special", "", 0); + token.content = isValidEntityCode(code) ? fromCodePoint(code) : fromCodePoint(65533); + token.markup = match[0]; + token.info = "entity"; + } + state.pos += match[0].length; + return true; + } + } else { + match = state.src.slice(pos).match(NAMED_RE); + if (match) { + if (has(entities, match[1])) { + if (!silent) { + token = state.push("text_special", "", 0); + token.content = entities[match[1]]; + token.markup = match[0]; + token.info = "entity"; + } + state.pos += match[0].length; + return true; + } + } + } + return false; + }; + // For each opening emphasis-like marker find a matching closing one + function processDelimiters(delimiters) { + var closerIdx, openerIdx, closer, opener, minOpenerIdx, newMinOpenerIdx, isOddMatch, lastJump, openersBottom = {}, max = delimiters.length; + if (!max) return; + // headerIdx is the first delimiter of the current (where closer is) delimiter run + var headerIdx = 0; + var lastTokenIdx = -2; + // needs any value lower than -1 + var jumps = []; + for (closerIdx = 0; closerIdx < max; closerIdx++) { + closer = delimiters[closerIdx]; + jumps.push(0); + // markers belong to same delimiter run if: + // - they have adjacent tokens + // - AND markers are the same + + if (delimiters[headerIdx].marker !== closer.marker || lastTokenIdx !== closer.token - 1) { + headerIdx = closerIdx; + } + lastTokenIdx = closer.token; + // Length is only used for emphasis-specific "rule of 3", + // if it's not defined (in strikethrough or 3rd party plugins), + // we can default it to 0 to disable those checks. + + closer.length = closer.length || 0; + if (!closer.close) continue; + // Previously calculated lower bounds (previous fails) + // for each marker, each delimiter length modulo 3, + // and for whether this closer can be an opener; + // https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460 + if (!openersBottom.hasOwnProperty(closer.marker)) { + openersBottom[closer.marker] = [ -1, -1, -1, -1, -1, -1 ]; + } + minOpenerIdx = openersBottom[closer.marker][(closer.open ? 3 : 0) + closer.length % 3]; + openerIdx = headerIdx - jumps[headerIdx] - 1; + newMinOpenerIdx = openerIdx; + for (;openerIdx > minOpenerIdx; openerIdx -= jumps[openerIdx] + 1) { + opener = delimiters[openerIdx]; + if (opener.marker !== closer.marker) continue; + if (opener.open && opener.end < 0) { + isOddMatch = false; + // from spec: + + // If one of the delimiters can both open and close emphasis, then the + // sum of the lengths of the delimiter runs containing the opening and + // closing delimiters must not be a multiple of 3 unless both lengths + // are multiples of 3. + + if (opener.close || closer.open) { + if ((opener.length + closer.length) % 3 === 0) { + if (opener.length % 3 !== 0 || closer.length % 3 !== 0) { + isOddMatch = true; + } + } + } + if (!isOddMatch) { + // If previous delimiter cannot be an opener, we can safely skip + // the entire sequence in future checks. This is required to make + // sure algorithm has linear complexity (see *_*_*_*_*_... case). + lastJump = openerIdx > 0 && !delimiters[openerIdx - 1].open ? jumps[openerIdx - 1] + 1 : 0; + jumps[closerIdx] = closerIdx - openerIdx + lastJump; + jumps[openerIdx] = lastJump; + closer.open = false; + opener.end = closerIdx; + opener.close = false; + newMinOpenerIdx = -1; + // treat next token as start of run, + // it optimizes skips in **<...>**a**<...>** pathological case + lastTokenIdx = -2; + break; + } + } + } + if (newMinOpenerIdx !== -1) { + // If match for this delimiter run failed, we want to set lower bound for + // future lookups. This is required to make sure algorithm has linear + // complexity. + // See details here: + // https://github.com/commonmark/cmark/issues/178#issuecomment-270417442 + openersBottom[closer.marker][(closer.open ? 3 : 0) + (closer.length || 0) % 3] = newMinOpenerIdx; + } + } + } + var balance_pairs = function link_pairs(state) { + var curr, tokens_meta = state.tokens_meta, max = state.tokens_meta.length; + processDelimiters(state.delimiters); + for (curr = 0; curr < max; curr++) { + if (tokens_meta[curr] && tokens_meta[curr].delimiters) { + processDelimiters(tokens_meta[curr].delimiters); + } + } + }; + // Clean up tokens after emphasis and strikethrough postprocessing: + var fragments_join = function fragments_join(state) { + var curr, last, level = 0, tokens = state.tokens, max = state.tokens.length; + for (curr = last = 0; curr < max; curr++) { + // re-calculate levels after emphasis/strikethrough turns some text nodes + // into opening/closing tags + if (tokens[curr].nesting < 0) level--; + // closing tag + tokens[curr].level = level; + if (tokens[curr].nesting > 0) level++; + // opening tag + if (tokens[curr].type === "text" && curr + 1 < max && tokens[curr + 1].type === "text") { + // collapse two adjacent text nodes + tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content; + } else { + if (curr !== last) { + tokens[last] = tokens[curr]; + } + last++; + } + } + if (curr !== last) { + tokens.length = last; + } + }; + var isWhiteSpace = utils.isWhiteSpace; + var isPunctChar = utils.isPunctChar; + var isMdAsciiPunct = utils.isMdAsciiPunct; + function StateInline(src, md, env, outTokens) { + this.src = src; + this.env = env; + this.md = md; + this.tokens = outTokens; + this.tokens_meta = Array(outTokens.length); + this.pos = 0; + this.posMax = this.src.length; + this.level = 0; + this.pending = ""; + this.pendingLevel = 0; + // Stores { start: end } pairs. Useful for backtrack + // optimization of pairs parse (emphasis, strikes). + this.cache = {}; + // List of emphasis-like delimiters for current tag + this.delimiters = []; + // Stack of delimiter lists for upper level tags + this._prev_delimiters = []; + // backtick length => last seen position + this.backticks = {}; + this.backticksScanned = false; + // Counter used to disable inline linkify-it execution + // inside and markdown links + this.linkLevel = 0; + } + // Flush pending text + + StateInline.prototype.pushPending = function() { + var token$1 = new token("text", "", 0); + token$1.content = this.pending; + token$1.level = this.pendingLevel; + this.tokens.push(token$1); + this.pending = ""; + return token$1; + }; + // Push new token to "stream". + // If pending text exists - flush it as text token + + StateInline.prototype.push = function(type, tag, nesting) { + if (this.pending) { + this.pushPending(); + } + var token$1 = new token(type, tag, nesting); + var token_meta = null; + if (nesting < 0) { + // closing tag + this.level--; + this.delimiters = this._prev_delimiters.pop(); + } + token$1.level = this.level; + if (nesting > 0) { + // opening tag + this.level++; + this._prev_delimiters.push(this.delimiters); + this.delimiters = []; + token_meta = { + delimiters: this.delimiters + }; + } + this.pendingLevel = this.level; + this.tokens.push(token$1); + this.tokens_meta.push(token_meta); + return token$1; + }; + // Scan a sequence of emphasis-like markers, and determine whether + // it can start an emphasis sequence or end an emphasis sequence. + + // - start - position to scan from (it should point at a valid marker); + // - canSplitWord - determine if these markers can be found inside a word + + StateInline.prototype.scanDelims = function(start, canSplitWord) { + var pos = start, lastChar, nextChar, count, can_open, can_close, isLastWhiteSpace, isLastPunctChar, isNextWhiteSpace, isNextPunctChar, left_flanking = true, right_flanking = true, max = this.posMax, marker = this.src.charCodeAt(start); + // treat beginning of the line as a whitespace + lastChar = start > 0 ? this.src.charCodeAt(start - 1) : 32; + while (pos < max && this.src.charCodeAt(pos) === marker) { + pos++; + } + count = pos - start; + // treat end of the line as a whitespace + nextChar = pos < max ? this.src.charCodeAt(pos) : 32; + isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar)); + isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar)); + isLastWhiteSpace = isWhiteSpace(lastChar); + isNextWhiteSpace = isWhiteSpace(nextChar); + if (isNextWhiteSpace) { + left_flanking = false; + } else if (isNextPunctChar) { + if (!(isLastWhiteSpace || isLastPunctChar)) { + left_flanking = false; + } + } + if (isLastWhiteSpace) { + right_flanking = false; + } else if (isLastPunctChar) { + if (!(isNextWhiteSpace || isNextPunctChar)) { + right_flanking = false; + } + } + if (!canSplitWord) { + can_open = left_flanking && (!right_flanking || isLastPunctChar); + can_close = right_flanking && (!left_flanking || isNextPunctChar); + } else { + can_open = left_flanking; + can_close = right_flanking; + } + return { + can_open: can_open, + can_close: can_close, + length: count + }; + }; + // re-export Token class to use in block rules + StateInline.prototype.Token = token; + var state_inline = StateInline; + //////////////////////////////////////////////////////////////////////////////// + // Parser rules + var _rules = [ [ "text", text ], [ "linkify", linkify ], [ "newline", newline ], [ "escape", _escape ], [ "backticks", backticks ], [ "strikethrough", strikethrough.tokenize ], [ "emphasis", emphasis.tokenize ], [ "link", link ], [ "image", image ], [ "autolink", autolink ], [ "html_inline", html_inline ], [ "entity", entity ] ]; + // `rule2` ruleset was created specifically for emphasis/strikethrough + // post-processing and may be changed in the future. + + // Don't use this for anything except pairs (plugins working with `balance_pairs`). + + var _rules2 = [ [ "balance_pairs", balance_pairs ], [ "strikethrough", strikethrough.postProcess ], [ "emphasis", emphasis.postProcess ], + // rules for pairs separate '**' into its own text tokens, which may be left unused, + // rule below merges unused segments back with the rest of the text + [ "fragments_join", fragments_join ] ]; + /** + * new ParserInline() + **/ function ParserInline() { + var i; + /** + * ParserInline#ruler -> Ruler + * + * [[Ruler]] instance. Keep configuration of inline rules. + **/ this.ruler = new ruler; + for (i = 0; i < _rules.length; i++) { + this.ruler.push(_rules[i][0], _rules[i][1]); + } + /** + * ParserInline#ruler2 -> Ruler + * + * [[Ruler]] instance. Second ruler used for post-processing + * (e.g. in emphasis-like rules). + **/ this.ruler2 = new ruler; + for (i = 0; i < _rules2.length; i++) { + this.ruler2.push(_rules2[i][0], _rules2[i][1]); + } + } + // Skip single token by running all rules in validation mode; + // returns `true` if any rule reported success + + ParserInline.prototype.skipToken = function(state) { + var ok, i, pos = state.pos, rules = this.ruler.getRules(""), len = rules.length, maxNesting = state.md.options.maxNesting, cache = state.cache; + if (typeof cache[pos] !== "undefined") { + state.pos = cache[pos]; + return; + } + if (state.level < maxNesting) { + for (i = 0; i < len; i++) { + // Increment state.level and decrement it later to limit recursion. + // It's harmless to do here, because no tokens are created. But ideally, + // we'd need a separate private state variable for this purpose. + state.level++; + ok = rules[i](state, true); + state.level--; + if (ok) { + if (pos >= state.pos) { + throw new Error("inline rule didn't increment state.pos"); + } + break; + } + } + } else { + // Too much nesting, just skip until the end of the paragraph. + // NOTE: this will cause links to behave incorrectly in the following case, + // when an amount of `[` is exactly equal to `maxNesting + 1`: + // [[[[[[[[[[[[[[[[[[[[[foo]() + // TODO: remove this workaround when CM standard will allow nested links + // (we can replace it by preventing links from being parsed in + // validation mode) + state.pos = state.posMax; + } + if (!ok) { + state.pos++; + } + cache[pos] = state.pos; + }; + // Generate tokens for input range + + ParserInline.prototype.tokenize = function(state) { + var ok, i, prevPos, rules = this.ruler.getRules(""), len = rules.length, end = state.posMax, maxNesting = state.md.options.maxNesting; + while (state.pos < end) { + // Try all possible rules. + // On success, rule should: + // - update `state.pos` + // - update `state.tokens` + // - return true + prevPos = state.pos; + if (state.level < maxNesting) { + for (i = 0; i < len; i++) { + ok = rules[i](state, false); + if (ok) { + if (prevPos >= state.pos) { + throw new Error("inline rule didn't increment state.pos"); + } + break; + } + } + } + if (ok) { + if (state.pos >= end) { + break; + } + continue; + } + state.pending += state.src[state.pos++]; + } + if (state.pending) { + state.pushPending(); + } + }; + /** + * ParserInline.parse(str, md, env, outTokens) + * + * Process input string and push inline tokens into `outTokens` + **/ ParserInline.prototype.parse = function(str, md, env, outTokens) { + var i, rules, len; + var state = new this.State(str, md, env, outTokens); + this.tokenize(state); + rules = this.ruler2.getRules(""); + len = rules.length; + for (i = 0; i < len; i++) { + rules[i](state); + } + }; + ParserInline.prototype.State = state_inline; + var parser_inline = ParserInline; + var re = function(opts) { + var re = {}; + opts = opts || {}; + // Use direct extract instead of `regenerate` to reduse browserified size + re.src_Any = regex$3.source; + re.src_Cc = regex$2.source; + re.src_Z = regex.source; + re.src_P = regex$4.source; + // \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation) + re.src_ZPCc = [ re.src_Z, re.src_P, re.src_Cc ].join("|"); + // \p{\Z\Cc} (white spaces + control) + re.src_ZCc = [ re.src_Z, re.src_Cc ].join("|"); + // Experimental. List of chars, completely prohibited in links + // because can separate it from other part of text + var text_separators = "[><\uff5c]"; + // All possible word characters (everything without punctuation, spaces & controls) + // Defined via punctuation & spaces to save space + // Should be something like \p{\L\N\S\M} (\w but without `_`) + re.src_pseudo_letter = "(?:(?!" + text_separators + "|" + re.src_ZPCc + ")" + re.src_Any + ")"; + // The same as abothe but without [0-9] + // var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')'; + //////////////////////////////////////////////////////////////////////////////// + re.src_ip4 = "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"; + // Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch. + re.src_auth = "(?:(?:(?!" + re.src_ZCc + "|[@/\\[\\]()]).)+@)?"; + re.src_port = "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?"; + re.src_host_terminator = "(?=$|" + text_separators + "|" + re.src_ZPCc + ")" + "(?!" + (opts["---"] ? "-(?!--)|" : "-|") + "_|:\\d|\\.-|\\.(?!$|" + re.src_ZPCc + "))"; + re.src_path = "(?:" + "[/?#]" + "(?:" + "(?!" + re.src_ZCc + "|" + text_separators + "|[()[\\]{}.,\"'?!\\-;]).|" + "\\[(?:(?!" + re.src_ZCc + "|\\]).)*\\]|" + "\\((?:(?!" + re.src_ZCc + "|[)]).)*\\)|" + "\\{(?:(?!" + re.src_ZCc + "|[}]).)*\\}|" + '\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' + "\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" + "\\'(?=" + re.src_pseudo_letter + "|[-])|" + // allow `I'm_king` if no pair found + "\\.{2,}[a-zA-Z0-9%/&]|" + // google has many dots in "google search" links (#66, #81). + // github has ... in commit range links, + // Restrict to + // - english + // - percent-encoded + // - parts of file path + // - params separator + // until more examples found. + "\\.(?!" + re.src_ZCc + "|[.]|$)|" + (opts["---"] ? "\\-(?!--(?:[^-]|$))(?:-*)|" : "\\-+|") + ",(?!" + re.src_ZCc + "|$)|" + // allow `,,,` in paths + ";(?!" + re.src_ZCc + "|$)|" + // allow `;` if not followed by space-like char + "\\!+(?!" + re.src_ZCc + "|[!]|$)|" + // allow `!!!` in paths, but not at the end + "\\?(?!" + re.src_ZCc + "|[?]|$)" + ")+" + "|\\/" + ")?"; + // Allow anything in markdown spec, forbid quote (") at the first position + // because emails enclosed in quotes are far more common + re.src_email_name = '[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*'; + re.src_xn = "xn--[a-z0-9\\-]{1,59}"; + // More to read about domain names + // http://serverfault.com/questions/638260/ + re.src_domain_root = + // Allow letters & digits (http://test1) + "(?:" + re.src_xn + "|" + re.src_pseudo_letter + "{1,63}" + ")"; + re.src_domain = "(?:" + re.src_xn + "|" + "(?:" + re.src_pseudo_letter + ")" + "|" + "(?:" + re.src_pseudo_letter + "(?:-|" + re.src_pseudo_letter + "){0,61}" + re.src_pseudo_letter + ")" + ")"; + re.src_host = "(?:" + + // Don't need IP check, because digits are already allowed in normal domain names + // src_ip4 + + // '|' + + "(?:(?:(?:" + re.src_domain + ")\\.)*" + re.src_domain /*_root*/ + ")" + ")"; + re.tpl_host_fuzzy = "(?:" + re.src_ip4 + "|" + "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))" + ")"; + re.tpl_host_no_ip_fuzzy = "(?:(?:(?:" + re.src_domain + ")\\.)+(?:%TLDS%))"; + re.src_host_strict = re.src_host + re.src_host_terminator; + re.tpl_host_fuzzy_strict = re.tpl_host_fuzzy + re.src_host_terminator; + re.src_host_port_strict = re.src_host + re.src_port + re.src_host_terminator; + re.tpl_host_port_fuzzy_strict = re.tpl_host_fuzzy + re.src_port + re.src_host_terminator; + re.tpl_host_port_no_ip_fuzzy_strict = re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator; + //////////////////////////////////////////////////////////////////////////////// + // Main rules + // Rude test fuzzy links by host, for quick deny + re.tpl_host_fuzzy_test = "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + re.src_ZPCc + "|>|$))"; + re.tpl_email_fuzzy = "(^|" + text_separators + '|"|\\(|' + re.src_ZCc + ")" + "(" + re.src_email_name + "@" + re.tpl_host_fuzzy_strict + ")"; + re.tpl_link_fuzzy = + // Fuzzy link can't be prepended with .:/\- and non punctuation. + // but can start with > (markdown blockquote) + "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_fuzzy_strict + re.src_path + ")"; + re.tpl_link_no_ip_fuzzy = + // Fuzzy link can't be prepended with .:/\- and non punctuation. + // but can start with > (markdown blockquote) + "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + re.src_ZPCc + "))" + "((?![$+<=>^`|\uff5c])" + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ")"; + return re; + }; + //////////////////////////////////////////////////////////////////////////////// + // Helpers + // Merge objects + + function assign(obj /*from1, from2, from3, ...*/) { + var sources = Array.prototype.slice.call(arguments, 1); + sources.forEach((function(source) { + if (!source) { + return; + } + Object.keys(source).forEach((function(key) { + obj[key] = source[key]; + })); + })); + return obj; + } + function _class(obj) { + return Object.prototype.toString.call(obj); + } + function isString(obj) { + return _class(obj) === "[object String]"; + } + function isObject(obj) { + return _class(obj) === "[object Object]"; + } + function isRegExp(obj) { + return _class(obj) === "[object RegExp]"; + } + function isFunction(obj) { + return _class(obj) === "[object Function]"; + } + function escapeRE(str) { + return str.replace(/[.?*+^$[\]\\(){}|-]/g, "\\$&"); + } + //////////////////////////////////////////////////////////////////////////////// + var defaultOptions = { + fuzzyLink: true, + fuzzyEmail: true, + fuzzyIP: false + }; + function isOptionsObj(obj) { + return Object.keys(obj || {}).reduce((function(acc, k) { + return acc || defaultOptions.hasOwnProperty(k); + }), false); + } + var defaultSchemas = { + "http:": { + validate: function(text, pos, self) { + var tail = text.slice(pos); + if (!self.re.http) { + // compile lazily, because "host"-containing variables can change on tlds update. + self.re.http = new RegExp("^\\/\\/" + self.re.src_auth + self.re.src_host_port_strict + self.re.src_path, "i"); + } + if (self.re.http.test(tail)) { + return tail.match(self.re.http)[0].length; + } + return 0; + } + }, + "https:": "http:", + "ftp:": "http:", + "//": { + validate: function(text, pos, self) { + var tail = text.slice(pos); + if (!self.re.no_http) { + // compile lazily, because "host"-containing variables can change on tlds update. + self.re.no_http = new RegExp("^" + self.re.src_auth + + // Don't allow single-level domains, because of false positives like '//test' + // with code comments + "(?:localhost|(?:(?:" + self.re.src_domain + ")\\.)+" + self.re.src_domain_root + ")" + self.re.src_port + self.re.src_host_terminator + self.re.src_path, "i"); + } + if (self.re.no_http.test(tail)) { + // should not be `://` & `///`, that protects from errors in protocol name + if (pos >= 3 && text[pos - 3] === ":") { + return 0; + } + if (pos >= 3 && text[pos - 3] === "/") { + return 0; + } + return tail.match(self.re.no_http)[0].length; + } + return 0; + } + }, + "mailto:": { + validate: function(text, pos, self) { + var tail = text.slice(pos); + if (!self.re.mailto) { + self.re.mailto = new RegExp("^" + self.re.src_email_name + "@" + self.re.src_host_strict, "i"); + } + if (self.re.mailto.test(tail)) { + return tail.match(self.re.mailto)[0].length; + } + return 0; + } + } + }; + /*eslint-disable max-len*/ + // RE pattern for 2-character tlds (autogenerated by ./support/tlds_2char_gen.js) + var tlds_2ch_src_re = "a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]"; + // DON'T try to make PRs with changes. Extend TLDs with LinkifyIt.tlds() instead + var tlds_default = "biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|\u0440\u0444".split("|"); + /*eslint-enable max-len*/ + //////////////////////////////////////////////////////////////////////////////// + function resetScanCache(self) { + self.__index__ = -1; + self.__text_cache__ = ""; + } + function createValidator(re) { + return function(text, pos) { + var tail = text.slice(pos); + if (re.test(tail)) { + return tail.match(re)[0].length; + } + return 0; + }; + } + function createNormalizer() { + return function(match, self) { + self.normalize(match); + }; + } + // Schemas compiler. Build regexps. + + function compile(self) { + // Load & clone RE patterns. + var re$1 = self.re = re(self.__opts__); + // Define dynamic patterns + var tlds = self.__tlds__.slice(); + self.onCompile(); + if (!self.__tlds_replaced__) { + tlds.push(tlds_2ch_src_re); + } + tlds.push(re$1.src_xn); + re$1.src_tlds = tlds.join("|"); + function untpl(tpl) { + return tpl.replace("%TLDS%", re$1.src_tlds); + } + re$1.email_fuzzy = RegExp(untpl(re$1.tpl_email_fuzzy), "i"); + re$1.link_fuzzy = RegExp(untpl(re$1.tpl_link_fuzzy), "i"); + re$1.link_no_ip_fuzzy = RegExp(untpl(re$1.tpl_link_no_ip_fuzzy), "i"); + re$1.host_fuzzy_test = RegExp(untpl(re$1.tpl_host_fuzzy_test), "i"); + + // Compile each schema + + var aliases = []; + self.__compiled__ = {}; + // Reset compiled data + function schemaError(name, val) { + throw new Error('(LinkifyIt) Invalid schema "' + name + '": ' + val); + } + Object.keys(self.__schemas__).forEach((function(name) { + var val = self.__schemas__[name]; + // skip disabled methods + if (val === null) { + return; + } + var compiled = { + validate: null, + link: null + }; + self.__compiled__[name] = compiled; + if (isObject(val)) { + if (isRegExp(val.validate)) { + compiled.validate = createValidator(val.validate); + } else if (isFunction(val.validate)) { + compiled.validate = val.validate; + } else { + schemaError(name, val); + } + if (isFunction(val.normalize)) { + compiled.normalize = val.normalize; + } else if (!val.normalize) { + compiled.normalize = createNormalizer(); + } else { + schemaError(name, val); + } + return; + } + if (isString(val)) { + aliases.push(name); + return; + } + schemaError(name, val); + })); + + // Compile postponed aliases + + aliases.forEach((function(alias) { + if (!self.__compiled__[self.__schemas__[alias]]) { + // Silently fail on missed schemas to avoid errons on disable. + // schemaError(alias, self.__schemas__[alias]); + return; + } + self.__compiled__[alias].validate = self.__compiled__[self.__schemas__[alias]].validate; + self.__compiled__[alias].normalize = self.__compiled__[self.__schemas__[alias]].normalize; + })); + + // Fake record for guessed links + + self.__compiled__[""] = { + validate: null, + normalize: createNormalizer() + }; + + // Build schema condition + + var slist = Object.keys(self.__compiled__).filter((function(name) { + // Filter disabled & fake schemas + return name.length > 0 && self.__compiled__[name]; + })).map(escapeRE).join("|"); + // (?!_) cause 1.5x slowdown + self.re.schema_test = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "i"); + self.re.schema_search = RegExp("(^|(?!_)(?:[><\uff5c]|" + re$1.src_ZPCc + "))(" + slist + ")", "ig"); + self.re.schema_at_start = RegExp("^" + self.re.schema_search.source, "i"); + self.re.pretest = RegExp("(" + self.re.schema_test.source + ")|(" + self.re.host_fuzzy_test.source + ")|@", "i"); + + // Cleanup + + resetScanCache(self); + } + /** + * class Match + * + * Match result. Single element of array, returned by [[LinkifyIt#match]] + **/ function Match(self, shift) { + var start = self.__index__, end = self.__last_index__, text = self.__text_cache__.slice(start, end); + /** + * Match#schema -> String + * + * Prefix (protocol) for matched string. + **/ this.schema = self.__schema__.toLowerCase(); + /** + * Match#index -> Number + * + * First position of matched string. + **/ this.index = start + shift; + /** + * Match#lastIndex -> Number + * + * Next position after matched string. + **/ this.lastIndex = end + shift; + /** + * Match#raw -> String + * + * Matched string. + **/ this.raw = text; + /** + * Match#text -> String + * + * Notmalized text of matched string. + **/ this.text = text; + /** + * Match#url -> String + * + * Normalized url of matched string. + **/ this.url = text; + } + function createMatch(self, shift) { + var match = new Match(self, shift); + self.__compiled__[match.schema].normalize(match, self); + return match; + } + /** + * class LinkifyIt + **/ + /** + * new LinkifyIt(schemas, options) + * - schemas (Object): Optional. Additional schemas to validate (prefix/validator) + * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false } + * + * Creates new linkifier instance with optional additional schemas. + * Can be called without `new` keyword for convenience. + * + * By default understands: + * + * - `http(s)://...` , `ftp://...`, `mailto:...` & `//...` links + * - "fuzzy" links and emails (example.com, foo@bar.com). + * + * `schemas` is an object, where each key/value describes protocol/rule: + * + * - __key__ - link prefix (usually, protocol name with `:` at the end, `skype:` + * for example). `linkify-it` makes shure that prefix is not preceeded with + * alphanumeric char and symbols. Only whitespaces and punctuation allowed. + * - __value__ - rule to check tail after link prefix + * - _String_ - just alias to existing rule + * - _Object_ + * - _validate_ - validator function (should return matched length on success), + * or `RegExp`. + * - _normalize_ - optional function to normalize text & url of matched result + * (for example, for @twitter mentions). + * + * `options`: + * + * - __fuzzyLink__ - recognige URL-s without `http(s):` prefix. Default `true`. + * - __fuzzyIP__ - allow IPs in fuzzy links above. Can conflict with some texts + * like version numbers. Default `false`. + * - __fuzzyEmail__ - recognize emails without `mailto:` prefix. + * + **/ function LinkifyIt(schemas, options) { + if (!(this instanceof LinkifyIt)) { + return new LinkifyIt(schemas, options); + } + if (!options) { + if (isOptionsObj(schemas)) { + options = schemas; + schemas = {}; + } + } + this.__opts__ = assign({}, defaultOptions, options); + // Cache last tested result. Used to skip repeating steps on next `match` call. + this.__index__ = -1; + this.__last_index__ = -1; + // Next scan position + this.__schema__ = ""; + this.__text_cache__ = ""; + this.__schemas__ = assign({}, defaultSchemas, schemas); + this.__compiled__ = {}; + this.__tlds__ = tlds_default; + this.__tlds_replaced__ = false; + this.re = {}; + compile(this); + } + /** chainable + * LinkifyIt#add(schema, definition) + * - schema (String): rule name (fixed pattern prefix) + * - definition (String|RegExp|Object): schema definition + * + * Add new rule definition. See constructor description for details. + **/ LinkifyIt.prototype.add = function add(schema, definition) { + this.__schemas__[schema] = definition; + compile(this); + return this; + }; + /** chainable + * LinkifyIt#set(options) + * - options (Object): { fuzzyLink|fuzzyEmail|fuzzyIP: true|false } + * + * Set recognition options for links without schema. + **/ LinkifyIt.prototype.set = function set(options) { + this.__opts__ = assign(this.__opts__, options); + return this; + }; + /** + * LinkifyIt#test(text) -> Boolean + * + * Searches linkifiable pattern and returns `true` on success or `false` on fail. + **/ LinkifyIt.prototype.test = function test(text) { + // Reset scan cache + this.__text_cache__ = text; + this.__index__ = -1; + if (!text.length) { + return false; + } + var m, ml, me, len, shift, next, re, tld_pos, at_pos; + // try to scan for link with schema - that's the most simple rule + if (this.re.schema_test.test(text)) { + re = this.re.schema_search; + re.lastIndex = 0; + while ((m = re.exec(text)) !== null) { + len = this.testSchemaAt(text, m[2], re.lastIndex); + if (len) { + this.__schema__ = m[2]; + this.__index__ = m.index + m[1].length; + this.__last_index__ = m.index + m[0].length + len; + break; + } + } + } + if (this.__opts__.fuzzyLink && this.__compiled__["http:"]) { + // guess schemaless links + tld_pos = text.search(this.re.host_fuzzy_test); + if (tld_pos >= 0) { + // if tld is located after found link - no need to check fuzzy pattern + if (this.__index__ < 0 || tld_pos < this.__index__) { + if ((ml = text.match(this.__opts__.fuzzyIP ? this.re.link_fuzzy : this.re.link_no_ip_fuzzy)) !== null) { + shift = ml.index + ml[1].length; + if (this.__index__ < 0 || shift < this.__index__) { + this.__schema__ = ""; + this.__index__ = shift; + this.__last_index__ = ml.index + ml[0].length; + } + } + } + } + } + if (this.__opts__.fuzzyEmail && this.__compiled__["mailto:"]) { + // guess schemaless emails + at_pos = text.indexOf("@"); + if (at_pos >= 0) { + // We can't skip this check, because this cases are possible: + // 192.168.1.1@gmail.com, my.in@example.com + if ((me = text.match(this.re.email_fuzzy)) !== null) { + shift = me.index + me[1].length; + next = me.index + me[0].length; + if (this.__index__ < 0 || shift < this.__index__ || shift === this.__index__ && next > this.__last_index__) { + this.__schema__ = "mailto:"; + this.__index__ = shift; + this.__last_index__ = next; + } + } + } + } + return this.__index__ >= 0; + }; + /** + * LinkifyIt#pretest(text) -> Boolean + * + * Very quick check, that can give false positives. Returns true if link MAY BE + * can exists. Can be used for speed optimization, when you need to check that + * link NOT exists. + **/ LinkifyIt.prototype.pretest = function pretest(text) { + return this.re.pretest.test(text); + }; + /** + * LinkifyIt#testSchemaAt(text, name, position) -> Number + * - text (String): text to scan + * - name (String): rule (schema) name + * - position (Number): text offset to check from + * + * Similar to [[LinkifyIt#test]] but checks only specific protocol tail exactly + * at given position. Returns length of found pattern (0 on fail). + **/ LinkifyIt.prototype.testSchemaAt = function testSchemaAt(text, schema, pos) { + // If not supported schema check requested - terminate + if (!this.__compiled__[schema.toLowerCase()]) { + return 0; + } + return this.__compiled__[schema.toLowerCase()].validate(text, pos, this); + }; + /** + * LinkifyIt#match(text) -> Array|null + * + * Returns array of found link descriptions or `null` on fail. We strongly + * recommend to use [[LinkifyIt#test]] first, for best speed. + * + * ##### Result match description + * + * - __schema__ - link schema, can be empty for fuzzy links, or `//` for + * protocol-neutral links. + * - __index__ - offset of matched text + * - __lastIndex__ - index of next char after mathch end + * - __raw__ - matched text + * - __text__ - normalized text + * - __url__ - link, generated from matched text + **/ LinkifyIt.prototype.match = function match(text) { + var shift = 0, result = []; + // Try to take previous element from cache, if .test() called before + if (this.__index__ >= 0 && this.__text_cache__ === text) { + result.push(createMatch(this, shift)); + shift = this.__last_index__; + } + // Cut head if cache was used + var tail = shift ? text.slice(shift) : text; + // Scan string until end reached + while (this.test(tail)) { + result.push(createMatch(this, shift)); + tail = tail.slice(this.__last_index__); + shift += this.__last_index__; + } + if (result.length) { + return result; + } + return null; + }; + /** + * LinkifyIt#matchAtStart(text) -> Match|null + * + * Returns fully-formed (not fuzzy) link if it starts at the beginning + * of the string, and null otherwise. + **/ LinkifyIt.prototype.matchAtStart = function matchAtStart(text) { + // Reset scan cache + this.__text_cache__ = text; + this.__index__ = -1; + if (!text.length) return null; + var m = this.re.schema_at_start.exec(text); + if (!m) return null; + var len = this.testSchemaAt(text, m[2], m[0].length); + if (!len) return null; + this.__schema__ = m[2]; + this.__index__ = m.index + m[1].length; + this.__last_index__ = m.index + m[0].length + len; + return createMatch(this, 0); + }; + /** chainable + * LinkifyIt#tlds(list [, keepOld]) -> this + * - list (Array): list of tlds + * - keepOld (Boolean): merge with current list if `true` (`false` by default) + * + * Load (or merge) new tlds list. Those are user for fuzzy links (without prefix) + * to avoid false positives. By default this algorythm used: + * + * - hostname with any 2-letter root zones are ok. + * - biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф + * are ok. + * - encoded (`xn--...`) root zones are ok. + * + * If list is replaced, then exact match for 2-chars root zones will be checked. + **/ LinkifyIt.prototype.tlds = function tlds(list, keepOld) { + list = Array.isArray(list) ? list : [ list ]; + if (!keepOld) { + this.__tlds__ = list.slice(); + this.__tlds_replaced__ = true; + compile(this); + return this; + } + this.__tlds__ = this.__tlds__.concat(list).sort().filter((function(el, idx, arr) { + return el !== arr[idx - 1]; + })).reverse(); + compile(this); + return this; + }; + /** + * LinkifyIt#normalize(match) + * + * Default normalizer (if schema does not define it's own). + **/ LinkifyIt.prototype.normalize = function normalize(match) { + // Do minimal possible changes by default. Need to collect feedback prior + // to move forward https://github.com/markdown-it/linkify-it/issues/1 + if (!match.schema) { + match.url = "http://" + match.url; + } + if (match.schema === "mailto:" && !/^mailto:/i.test(match.url)) { + match.url = "mailto:" + match.url; + } + }; + /** + * LinkifyIt#onCompile() + * + * Override to modify basic RegExp-s. + **/ LinkifyIt.prototype.onCompile = function onCompile() {}; + var linkifyIt = LinkifyIt; + /*! https://mths.be/punycode v1.4.1 by @mathias */ + /** Highest positive signed 32-bit float value */ var maxInt = 2147483647; + // aka. 0x7FFFFFFF or 2^31-1 + /** Bootstring parameters */ var base = 36; + var tMin = 1; + var tMax = 26; + var skew = 38; + var damp = 700; + var initialBias = 72; + var initialN = 128; + // 0x80 + var delimiter = "-"; + // '\x2D' + /** Regular expressions */ var regexPunycode = /^xn--/; + var regexNonASCII = /[^\x20-\x7E]/; + // unprintable ASCII chars + non-ASCII chars + var regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g; + // RFC 3490 separators + /** Error messages */ var errors = { + overflow: "Overflow: input needs wider integers to process", + "not-basic": "Illegal input >= 0x80 (not a basic code point)", + "invalid-input": "Invalid input" + }; + /** Convenience shortcuts */ var baseMinusTMin = base - tMin; + var floor = Math.floor; + var stringFromCharCode = String.fromCharCode; + /*--------------------------------------------------------------------------*/ + /** + * A generic error utility function. + * @private + * @param {String} type The error type. + * @returns {Error} Throws a `RangeError` with the applicable error message. + */ function error(type) { + throw new RangeError(errors[type]); + } + /** + * A generic `Array#map` utility function. + * @private + * @param {Array} array The array to iterate over. + * @param {Function} callback The function that gets called for every array + * item. + * @returns {Array} A new array of values returned by the callback function. + */ function map(array, fn) { + var length = array.length; + var result = []; + while (length--) { + result[length] = fn(array[length]); + } + return result; + } + /** + * A simple `Array#map`-like wrapper to work with domain name strings or email + * addresses. + * @private + * @param {String} domain The domain name or email address. + * @param {Function} callback The function that gets called for every + * character. + * @returns {Array} A new string of characters returned by the callback + * function. + */ function mapDomain(string, fn) { + var parts = string.split("@"); + var result = ""; + if (parts.length > 1) { + // In email addresses, only the domain name should be punycoded. Leave + // the local part (i.e. everything up to `@`) intact. + result = parts[0] + "@"; + string = parts[1]; + } + // Avoid `split(regex)` for IE8 compatibility. See #17. + string = string.replace(regexSeparators, "."); + var labels = string.split("."); + var encoded = map(labels, fn).join("."); + return result + encoded; + } + /** + * Creates an array containing the numeric code points of each Unicode + * character in the string. While JavaScript uses UCS-2 internally, + * this function will convert a pair of surrogate halves (each of which + * UCS-2 exposes as separate characters) into a single code point, + * matching UTF-16. + * @see `punycode.ucs2.encode` + * @see + * @memberOf punycode.ucs2 + * @name decode + * @param {String} string The Unicode input string (UCS-2). + * @returns {Array} The new array of code points. + */ function ucs2decode(string) { + var output = [], counter = 0, length = string.length, value, extra; + while (counter < length) { + value = string.charCodeAt(counter++); + if (value >= 55296 && value <= 56319 && counter < length) { + // high surrogate, and there is a next character + extra = string.charCodeAt(counter++); + if ((extra & 64512) == 56320) { + // low surrogate + output.push(((value & 1023) << 10) + (extra & 1023) + 65536); + } else { + // unmatched surrogate; only append this code unit, in case the next + // code unit is the high surrogate of a surrogate pair + output.push(value); + counter--; + } + } else { + output.push(value); + } + } + return output; + } + /** + * Creates a string based on an array of numeric code points. + * @see `punycode.ucs2.decode` + * @memberOf punycode.ucs2 + * @name encode + * @param {Array} codePoints The array of numeric code points. + * @returns {String} The new Unicode string (UCS-2). + */ function ucs2encode(array) { + return map(array, (function(value) { + var output = ""; + if (value > 65535) { + value -= 65536; + output += stringFromCharCode(value >>> 10 & 1023 | 55296); + value = 56320 | value & 1023; + } + output += stringFromCharCode(value); + return output; + })).join(""); + } + /** + * Converts a basic code point into a digit/integer. + * @see `digitToBasic()` + * @private + * @param {Number} codePoint The basic numeric code point value. + * @returns {Number} The numeric value of a basic code point (for use in + * representing integers) in the range `0` to `base - 1`, or `base` if + * the code point does not represent a value. + */ function basicToDigit(codePoint) { + if (codePoint - 48 < 10) { + return codePoint - 22; + } + if (codePoint - 65 < 26) { + return codePoint - 65; + } + if (codePoint - 97 < 26) { + return codePoint - 97; + } + return base; + } + /** + * Converts a digit/integer into a basic code point. + * @see `basicToDigit()` + * @private + * @param {Number} digit The numeric value of a basic code point. + * @returns {Number} The basic code point whose value (when used for + * representing integers) is `digit`, which needs to be in the range + * `0` to `base - 1`. If `flag` is non-zero, the uppercase form is + * used; else, the lowercase form is used. The behavior is undefined + * if `flag` is non-zero and `digit` has no uppercase form. + */ function digitToBasic(digit, flag) { + // 0..25 map to ASCII a..z or A..Z + // 26..35 map to ASCII 0..9 + return digit + 22 + 75 * (digit < 26) - ((flag != 0) << 5); + } + /** + * Bias adaptation function as per section 3.4 of RFC 3492. + * https://tools.ietf.org/html/rfc3492#section-3.4 + * @private + */ function adapt(delta, numPoints, firstTime) { + var k = 0; + delta = firstTime ? floor(delta / damp) : delta >> 1; + delta += floor(delta / numPoints); + for (;delta > baseMinusTMin * tMax >> 1; k += base) { + delta = floor(delta / baseMinusTMin); + } + return floor(k + (baseMinusTMin + 1) * delta / (delta + skew)); + } + /** + * Converts a Punycode string of ASCII-only symbols to a string of Unicode + * symbols. + * @memberOf punycode + * @param {String} input The Punycode string of ASCII-only symbols. + * @returns {String} The resulting string of Unicode symbols. + */ function decode(input) { + // Don't use UCS-2 + var output = [], inputLength = input.length, out, i = 0, n = initialN, bias = initialBias, basic, j, index, oldi, w, k, digit, t, + /** Cached calculation results */ + baseMinusT; + // Handle the basic code points: let `basic` be the number of input code + // points before the last delimiter, or `0` if there is none, then copy + // the first basic code points to the output. + basic = input.lastIndexOf(delimiter); + if (basic < 0) { + basic = 0; + } + for (j = 0; j < basic; ++j) { + // if it's not a basic code point + if (input.charCodeAt(j) >= 128) { + error("not-basic"); + } + output.push(input.charCodeAt(j)); + } + // Main decoding loop: start just after the last delimiter if any basic code + // points were copied; start at the beginning otherwise. + for (index = basic > 0 ? basic + 1 : 0; index < inputLength; ) { + // `index` is the index of the next character to be consumed. + // Decode a generalized variable-length integer into `delta`, + // which gets added to `i`. The overflow checking is easier + // if we increase `i` as we go, then subtract off its starting + // value at the end to obtain `delta`. + for (oldi = i, w = 1, k = base; ;k += base) { + if (index >= inputLength) { + error("invalid-input"); + } + digit = basicToDigit(input.charCodeAt(index++)); + if (digit >= base || digit > floor((maxInt - i) / w)) { + error("overflow"); + } + i += digit * w; + t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias; + if (digit < t) { + break; + } + baseMinusT = base - t; + if (w > floor(maxInt / baseMinusT)) { + error("overflow"); + } + w *= baseMinusT; + } + out = output.length + 1; + bias = adapt(i - oldi, out, oldi == 0); + // `i` was supposed to wrap around from `out` to `0`, + // incrementing `n` each time, so we'll fix that now: + if (floor(i / out) > maxInt - n) { + error("overflow"); + } + n += floor(i / out); + i %= out; + // Insert `n` at position `i` of the output + output.splice(i++, 0, n); + } + return ucs2encode(output); + } + /** + * Converts a string of Unicode symbols (e.g. a domain name label) to a + * Punycode string of ASCII-only symbols. + * @memberOf punycode + * @param {String} input The string of Unicode symbols. + * @returns {String} The resulting Punycode string of ASCII-only symbols. + */ function encode(input) { + var n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, currentValue, output = [], + /** `inputLength` will hold the number of code points in `input`. */ + inputLength, + /** Cached calculation results */ + handledCPCountPlusOne, baseMinusT, qMinusT; + // Convert the input in UCS-2 to Unicode + input = ucs2decode(input); + // Cache the length + inputLength = input.length; + // Initialize the state + n = initialN; + delta = 0; + bias = initialBias; + // Handle the basic code points + for (j = 0; j < inputLength; ++j) { + currentValue = input[j]; + if (currentValue < 128) { + output.push(stringFromCharCode(currentValue)); + } + } + handledCPCount = basicLength = output.length; + // `handledCPCount` is the number of code points that have been handled; + // `basicLength` is the number of basic code points. + // Finish the basic string - if it is not empty - with a delimiter + if (basicLength) { + output.push(delimiter); + } + // Main encoding loop: + while (handledCPCount < inputLength) { + // All non-basic code points < n have been handled already. Find the next + // larger one: + for (m = maxInt, j = 0; j < inputLength; ++j) { + currentValue = input[j]; + if (currentValue >= n && currentValue < m) { + m = currentValue; + } + } + // Increase `delta` enough to advance the decoder's state to , + // but guard against overflow + handledCPCountPlusOne = handledCPCount + 1; + if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) { + error("overflow"); + } + delta += (m - n) * handledCPCountPlusOne; + n = m; + for (j = 0; j < inputLength; ++j) { + currentValue = input[j]; + if (currentValue < n && ++delta > maxInt) { + error("overflow"); + } + if (currentValue == n) { + // Represent delta as a generalized variable-length integer + for (q = delta, k = base; ;k += base) { + t = k <= bias ? tMin : k >= bias + tMax ? tMax : k - bias; + if (q < t) { + break; + } + qMinusT = q - t; + baseMinusT = base - t; + output.push(stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0))); + q = floor(qMinusT / baseMinusT); + } + output.push(stringFromCharCode(digitToBasic(q, 0))); + bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength); + delta = 0; + ++handledCPCount; + } + } + ++delta; + ++n; + } + return output.join(""); + } + /** + * Converts a Punycode string representing a domain name or an email address + * to Unicode. Only the Punycoded parts of the input will be converted, i.e. + * it doesn't matter if you call it on a string that has already been + * converted to Unicode. + * @memberOf punycode + * @param {String} input The Punycoded domain name or email address to + * convert to Unicode. + * @returns {String} The Unicode representation of the given Punycode + * string. + */ function toUnicode(input) { + return mapDomain(input, (function(string) { + return regexPunycode.test(string) ? decode(string.slice(4).toLowerCase()) : string; + })); + } + /** + * Converts a Unicode string representing a domain name or an email address to + * Punycode. Only the non-ASCII parts of the domain name will be converted, + * i.e. it doesn't matter if you call it with a domain that's already in + * ASCII. + * @memberOf punycode + * @param {String} input The domain name or email address to convert, as a + * Unicode string. + * @returns {String} The Punycode representation of the given domain name or + * email address. + */ function toASCII(input) { + return mapDomain(input, (function(string) { + return regexNonASCII.test(string) ? "xn--" + encode(string) : string; + })); + } + var version = "1.4.1"; + /** + * An object of methods to convert from JavaScript's internal character + * representation (UCS-2) to Unicode code points, and back. + * @see + * @memberOf punycode + * @type Object + */ var ucs2 = { + decode: ucs2decode, + encode: ucs2encode + }; + var punycode$1 = { + version: version, + ucs2: ucs2, + toASCII: toASCII, + toUnicode: toUnicode, + encode: encode, + decode: decode + }; + var punycode$2 = Object.freeze({ + __proto__: null, + decode: decode, + encode: encode, + toUnicode: toUnicode, + toASCII: toASCII, + version: version, + ucs2: ucs2, + default: punycode$1 + }); + // markdown-it default options + var _default = { + options: { + html: false, + // Enable HTML tags in source + xhtmlOut: false, + // Use '/' to close single tags (
) + breaks: false, + // Convert '\n' in paragraphs into
+ langPrefix: "language-", + // CSS language prefix for fenced blocks + linkify: false, + // autoconvert URL-like texts to links + // Enable some language-neutral replacements + quotes beautification + typographer: false, + // Double + single quotes replacement pairs, when typographer enabled, + // and smartquotes on. Could be either a String or an Array. + // For example, you can use '«»„“' for Russian, '„“‚‘' for German, + // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp). + quotes: "\u201c\u201d\u2018\u2019", + /* “”‘’ */ + // Highlighter function. Should return escaped HTML, + // or '' if the source string is not changed and should be escaped externaly. + // If result starts with ) + breaks: false, + // Convert '\n' in paragraphs into
+ langPrefix: "language-", + // CSS language prefix for fenced blocks + linkify: false, + // autoconvert URL-like texts to links + // Enable some language-neutral replacements + quotes beautification + typographer: false, + // Double + single quotes replacement pairs, when typographer enabled, + // and smartquotes on. Could be either a String or an Array. + // For example, you can use '«»„“' for Russian, '„“‚‘' for German, + // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp). + quotes: "\u201c\u201d\u2018\u2019", + /* “”‘’ */ + // Highlighter function. Should return escaped HTML, + // or '' if the source string is not changed and should be escaped externaly. + // If result starts with ) + breaks: false, + // Convert '\n' in paragraphs into
+ langPrefix: "language-", + // CSS language prefix for fenced blocks + linkify: false, + // autoconvert URL-like texts to links + // Enable some language-neutral replacements + quotes beautification + typographer: false, + // Double + single quotes replacement pairs, when typographer enabled, + // and smartquotes on. Could be either a String or an Array. + // For example, you can use '«»„“' for Russian, '„“‚‘' for German, + // and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp). + quotes: "\u201c\u201d\u2018\u2019", + /* “”‘’ */ + // Highlighter function. Should return escaped HTML, + // or '' if the source string is not changed and should be escaped externaly. + // If result starts with = 0) { + try { + parsed.hostname = punycode.toASCII(parsed.hostname); + } catch (er) {} + } + } + return mdurl.encode(mdurl.format(parsed)); + } + function normalizeLinkText(url) { + var parsed = mdurl.parse(url, true); + if (parsed.hostname) { + // Encode hostnames in urls like: + // `http://host/`, `https://host/`, `mailto:user@host`, `//host/` + // We don't encode unknown schemas, because it's likely that we encode + // something we shouldn't (e.g. `skype:name` treated as `skype:host`) + if (!parsed.protocol || RECODE_HOSTNAME_FOR.indexOf(parsed.protocol) >= 0) { + try { + parsed.hostname = punycode.toUnicode(parsed.hostname); + } catch (er) {} + } + } + // add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720 + return mdurl.decode(mdurl.format(parsed), mdurl.decode.defaultChars + "%"); + } + /** + * class MarkdownIt + * + * Main parser/renderer class. + * + * ##### Usage + * + * ```javascript + * // node.js, "classic" way: + * var MarkdownIt = require('markdown-it'), + * md = new MarkdownIt(); + * var result = md.render('# markdown-it rulezz!'); + * + * // node.js, the same, but with sugar: + * var md = require('markdown-it')(); + * var result = md.render('# markdown-it rulezz!'); + * + * // browser without AMD, added to "window" on script load + * // Note, there are no dash. + * var md = window.markdownit(); + * var result = md.render('# markdown-it rulezz!'); + * ``` + * + * Single line rendering, without paragraph wrap: + * + * ```javascript + * var md = require('markdown-it')(); + * var result = md.renderInline('__markdown-it__ rulezz!'); + * ``` + **/ + /** + * new MarkdownIt([presetName, options]) + * - presetName (String): optional, `commonmark` / `zero` + * - options (Object) + * + * Creates parser instanse with given config. Can be called without `new`. + * + * ##### presetName + * + * MarkdownIt provides named presets as a convenience to quickly + * enable/disable active syntax rules and options for common use cases. + * + * - ["commonmark"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/commonmark.js) - + * configures parser to strict [CommonMark](http://commonmark.org/) mode. + * - [default](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/default.js) - + * similar to GFM, used when no preset name given. Enables all available rules, + * but still without html, typographer & autolinker. + * - ["zero"](https://github.com/markdown-it/markdown-it/blob/master/lib/presets/zero.js) - + * all rules disabled. Useful to quickly setup your config via `.enable()`. + * For example, when you need only `bold` and `italic` markup and nothing else. + * + * ##### options: + * + * - __html__ - `false`. Set `true` to enable HTML tags in source. Be careful! + * That's not safe! You may need external sanitizer to protect output from XSS. + * It's better to extend features via plugins, instead of enabling HTML. + * - __xhtmlOut__ - `false`. Set `true` to add '/' when closing single tags + * (`
`). This is needed only for full CommonMark compatibility. In real + * world you will need HTML output. + * - __breaks__ - `false`. Set `true` to convert `\n` in paragraphs into `
`. + * - __langPrefix__ - `language-`. CSS language class prefix for fenced blocks. + * Can be useful for external highlighters. + * - __linkify__ - `false`. Set `true` to autoconvert URL-like text to links. + * - __typographer__ - `false`. Set `true` to enable [some language-neutral + * replacement](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/replacements.js) + + * quotes beautification (smartquotes). + * - __quotes__ - `“”‘’`, String or Array. Double + single quotes replacement + * pairs, when typographer enabled and smartquotes on. For example, you can + * use `'«»„“'` for Russian, `'„“‚‘'` for German, and + * `['«\xA0', '\xA0»', '‹\xA0', '\xA0›']` for French (including nbsp). + * - __highlight__ - `null`. Highlighter function for fenced code blocks. + * Highlighter `function (str, lang)` should return escaped HTML. It can also + * return empty string if the source was not changed and should be escaped + * externaly. If result starts with `): + * + * ```javascript + * var hljs = require('highlight.js') // https://highlightjs.org/ + * + * // Actual default values + * var md = require('markdown-it')({ + * highlight: function (str, lang) { + * if (lang && hljs.getLanguage(lang)) { + * try { + * return '
' +
+	 *                hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
+	 *                '
'; + * } catch (__) {} + * } + * + * return '
' + md.utils.escapeHtml(str) + '
'; + * } + * }); + * ``` + * + **/ function MarkdownIt(presetName, options) { + if (!(this instanceof MarkdownIt)) { + return new MarkdownIt(presetName, options); + } + if (!options) { + if (!utils.isString(presetName)) { + options = presetName || {}; + presetName = "default"; + } + } + /** + * MarkdownIt#inline -> ParserInline + * + * Instance of [[ParserInline]]. You may need it to add new rules when + * writing plugins. For simple rules control use [[MarkdownIt.disable]] and + * [[MarkdownIt.enable]]. + **/ this.inline = new parser_inline; + /** + * MarkdownIt#block -> ParserBlock + * + * Instance of [[ParserBlock]]. You may need it to add new rules when + * writing plugins. For simple rules control use [[MarkdownIt.disable]] and + * [[MarkdownIt.enable]]. + **/ this.block = new parser_block; + /** + * MarkdownIt#core -> Core + * + * Instance of [[Core]] chain executor. You may need it to add new rules when + * writing plugins. For simple rules control use [[MarkdownIt.disable]] and + * [[MarkdownIt.enable]]. + **/ this.core = new parser_core; + /** + * MarkdownIt#renderer -> Renderer + * + * Instance of [[Renderer]]. Use it to modify output look. Or to add rendering + * rules for new token types, generated by plugins. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')(); + * + * function myToken(tokens, idx, options, env, self) { + * //... + * return result; + * }; + * + * md.renderer.rules['my_token'] = myToken + * ``` + * + * See [[Renderer]] docs and [source code](https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js). + **/ this.renderer = new renderer; + /** + * MarkdownIt#linkify -> LinkifyIt + * + * [linkify-it](https://github.com/markdown-it/linkify-it) instance. + * Used by [linkify](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/linkify.js) + * rule. + **/ this.linkify = new linkifyIt; + /** + * MarkdownIt#validateLink(url) -> Boolean + * + * Link validation function. CommonMark allows too much in links. By default + * we disable `javascript:`, `vbscript:`, `file:` schemas, and almost all `data:...` schemas + * except some embedded image types. + * + * You can change this behaviour: + * + * ```javascript + * var md = require('markdown-it')(); + * // enable everything + * md.validateLink = function () { return true; } + * ``` + **/ this.validateLink = validateLink; + /** + * MarkdownIt#normalizeLink(url) -> String + * + * Function used to encode link url to a machine-readable format, + * which includes url-encoding, punycode, etc. + **/ this.normalizeLink = normalizeLink; + /** + * MarkdownIt#normalizeLinkText(url) -> String + * + * Function used to decode link url to a human-readable format` + **/ this.normalizeLinkText = normalizeLinkText; + // Expose utils & helpers for easy acces from plugins + /** + * MarkdownIt#utils -> utils + * + * Assorted utility functions, useful to write plugins. See details + * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/common/utils.js). + **/ this.utils = utils; + /** + * MarkdownIt#helpers -> helpers + * + * Link components parser functions, useful to write plugins. See details + * [here](https://github.com/markdown-it/markdown-it/blob/master/lib/helpers). + **/ this.helpers = utils.assign({}, helpers); + this.options = {}; + this.configure(presetName); + if (options) { + this.set(options); + } + } + /** chainable + * MarkdownIt.set(options) + * + * Set parser options (in the same format as in constructor). Probably, you + * will never need it, but you can change options after constructor call. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')() + * .set({ html: true, breaks: true }) + * .set({ typographer, true }); + * ``` + * + * __Note:__ To achieve the best possible performance, don't modify a + * `markdown-it` instance options on the fly. If you need multiple configurations + * it's best to create multiple instances and initialize each with separate + * config. + **/ MarkdownIt.prototype.set = function(options) { + utils.assign(this.options, options); + return this; + }; + /** chainable, internal + * MarkdownIt.configure(presets) + * + * Batch load of all options and compenent settings. This is internal method, + * and you probably will not need it. But if you will - see available presets + * and data structure [here](https://github.com/markdown-it/markdown-it/tree/master/lib/presets) + * + * We strongly recommend to use presets instead of direct config loads. That + * will give better compatibility with next versions. + **/ MarkdownIt.prototype.configure = function(presets) { + var self = this, presetName; + if (utils.isString(presets)) { + presetName = presets; + presets = config[presetName]; + if (!presets) { + throw new Error('Wrong `markdown-it` preset "' + presetName + '", check name'); + } + } + if (!presets) { + throw new Error("Wrong `markdown-it` preset, can't be empty"); + } + if (presets.options) { + self.set(presets.options); + } + if (presets.components) { + Object.keys(presets.components).forEach((function(name) { + if (presets.components[name].rules) { + self[name].ruler.enableOnly(presets.components[name].rules); + } + if (presets.components[name].rules2) { + self[name].ruler2.enableOnly(presets.components[name].rules2); + } + })); + } + return this; + }; + /** chainable + * MarkdownIt.enable(list, ignoreInvalid) + * - list (String|Array): rule name or list of rule names to enable + * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found. + * + * Enable list or rules. It will automatically find appropriate components, + * containing rules with given names. If rule not found, and `ignoreInvalid` + * not set - throws exception. + * + * ##### Example + * + * ```javascript + * var md = require('markdown-it')() + * .enable(['sub', 'sup']) + * .disable('smartquotes'); + * ``` + **/ MarkdownIt.prototype.enable = function(list, ignoreInvalid) { + var result = []; + if (!Array.isArray(list)) { + list = [ list ]; + } + [ "core", "block", "inline" ].forEach((function(chain) { + result = result.concat(this[chain].ruler.enable(list, true)); + }), this); + result = result.concat(this.inline.ruler2.enable(list, true)); + var missed = list.filter((function(name) { + return result.indexOf(name) < 0; + })); + if (missed.length && !ignoreInvalid) { + throw new Error("MarkdownIt. Failed to enable unknown rule(s): " + missed); + } + return this; + }; + /** chainable + * MarkdownIt.disable(list, ignoreInvalid) + * - list (String|Array): rule name or list of rule names to disable. + * - ignoreInvalid (Boolean): set `true` to ignore errors when rule not found. + * + * The same as [[MarkdownIt.enable]], but turn specified rules off. + **/ MarkdownIt.prototype.disable = function(list, ignoreInvalid) { + var result = []; + if (!Array.isArray(list)) { + list = [ list ]; + } + [ "core", "block", "inline" ].forEach((function(chain) { + result = result.concat(this[chain].ruler.disable(list, true)); + }), this); + result = result.concat(this.inline.ruler2.disable(list, true)); + var missed = list.filter((function(name) { + return result.indexOf(name) < 0; + })); + if (missed.length && !ignoreInvalid) { + throw new Error("MarkdownIt. Failed to disable unknown rule(s): " + missed); + } + return this; + }; + /** chainable + * MarkdownIt.use(plugin, params) + * + * Load specified plugin with given params into current parser instance. + * It's just a sugar to call `plugin(md, params)` with curring. + * + * ##### Example + * + * ```javascript + * var iterator = require('markdown-it-for-inline'); + * var md = require('markdown-it')() + * .use(iterator, 'foo_replace', 'text', function (tokens, idx) { + * tokens[idx].content = tokens[idx].content.replace(/foo/g, 'bar'); + * }); + * ``` + **/ MarkdownIt.prototype.use = function(plugin /*, params, ... */) { + var args = [ this ].concat(Array.prototype.slice.call(arguments, 1)); + plugin.apply(plugin, args); + return this; + }; + /** internal + * MarkdownIt.parse(src, env) -> Array + * - src (String): source string + * - env (Object): environment sandbox + * + * Parse input string and return list of block tokens (special token type + * "inline" will contain list of inline tokens). You should not call this + * method directly, until you write custom renderer (for example, to produce + * AST). + * + * `env` is used to pass data between "distributed" rules and return additional + * metadata like reference info, needed for the renderer. It also can be used to + * inject data in specific cases. Usually, you will be ok to pass `{}`, + * and then pass updated object to renderer. + **/ MarkdownIt.prototype.parse = function(src, env) { + if (typeof src !== "string") { + throw new Error("Input data should be a String"); + } + var state = new this.core.State(src, this, env); + this.core.process(state); + return state.tokens; + }; + /** + * MarkdownIt.render(src [, env]) -> String + * - src (String): source string + * - env (Object): environment sandbox + * + * Render markdown string into html. It does all magic for you :). + * + * `env` can be used to inject additional metadata (`{}` by default). + * But you will not need it with high probability. See also comment + * in [[MarkdownIt.parse]]. + **/ MarkdownIt.prototype.render = function(src, env) { + env = env || {}; + return this.renderer.render(this.parse(src, env), this.options, env); + }; + /** internal + * MarkdownIt.parseInline(src, env) -> Array + * - src (String): source string + * - env (Object): environment sandbox + * + * The same as [[MarkdownIt.parse]] but skip all block rules. It returns the + * block tokens list with the single `inline` element, containing parsed inline + * tokens in `children` property. Also updates `env` object. + **/ MarkdownIt.prototype.parseInline = function(src, env) { + var state = new this.core.State(src, this, env); + state.inlineMode = true; + this.core.process(state); + return state.tokens; + }; + /** + * MarkdownIt.renderInline(src [, env]) -> String + * - src (String): source string + * - env (Object): environment sandbox + * + * Similar to [[MarkdownIt.render]] but for single paragraph content. Result + * will NOT be wrapped into `

` tags. + **/ MarkdownIt.prototype.renderInline = function(src, env) { + env = env || {}; + return this.renderer.render(this.parseInline(src, env), this.options, env); + }; + var lib = MarkdownIt; + var markdownIt = lib; + return markdownIt; +})); + diff --git a/examples/server/public/deps_tailwindcss.js b/examples/server/public/deps_tailwindcss.js new file mode 100644 index 000000000..6736cb8ca --- /dev/null +++ b/examples/server/public/deps_tailwindcss.js @@ -0,0 +1,82 @@ +(()=>{var Iv=Object.create;var Ui=Object.defineProperty;var Dv=Object.getOwnPropertyDescriptor;var qv=Object.getOwnPropertyNames;var $v=Object.getPrototypeOf,Lv=Object.prototype.hasOwnProperty;var cf=r=>Ui(r,"__esModule",{value:!0});var pf=r=>{if(typeof require!="undefined")return require(r);throw new Error('Dynamic require of "'+r+'" is not supported')};var R=(r,e)=>()=>(r&&(e=r(r=0)),e);var x=(r,e)=>()=>(e||r((e={exports:{}}).exports,e),e.exports),Ge=(r,e)=>{cf(r);for(var t in e)Ui(r,t,{get:e[t],enumerable:!0})},Mv=(r,e,t)=>{if(e&&typeof e=="object"||typeof e=="function")for(let i of qv(e))!Lv.call(r,i)&&i!=="default"&&Ui(r,i,{get:()=>e[i],enumerable:!(t=Dv(e,i))||t.enumerable});return r},pe=r=>Mv(cf(Ui(r!=null?Iv($v(r)):{},"default",r&&r.__esModule&&"default"in r?{get:()=>r.default,enumerable:!0}:{value:r,enumerable:!0})),r);var m,u=R(()=>{m={platform:"",env:{},versions:{node:"14.17.6"}}});var Nv,be,ft=R(()=>{u();Nv=0,be={readFileSync:r=>self[r]||"",statSync:()=>({mtimeMs:Nv++}),promises:{readFile:r=>Promise.resolve(self[r]||"")}}});var Ns=x((sP,hf)=>{u();"use strict";var df=class{constructor(e={}){if(!(e.maxSize&&e.maxSize>0))throw new TypeError("`maxSize` must be a number greater than 0");if(typeof e.maxAge=="number"&&e.maxAge===0)throw new TypeError("`maxAge` must be a number greater than 0");this.maxSize=e.maxSize,this.maxAge=e.maxAge||1/0,this.onEviction=e.onEviction,this.cache=new Map,this.oldCache=new Map,this._size=0}_emitEvictions(e){if(typeof this.onEviction=="function")for(let[t,i]of e)this.onEviction(t,i.value)}_deleteIfExpired(e,t){return typeof t.expiry=="number"&&t.expiry<=Date.now()?(typeof this.onEviction=="function"&&this.onEviction(e,t.value),this.delete(e)):!1}_getOrDeleteIfExpired(e,t){if(this._deleteIfExpired(e,t)===!1)return t.value}_getItemValue(e,t){return t.expiry?this._getOrDeleteIfExpired(e,t):t.value}_peek(e,t){let i=t.get(e);return this._getItemValue(e,i)}_set(e,t){this.cache.set(e,t),this._size++,this._size>=this.maxSize&&(this._size=0,this._emitEvictions(this.oldCache),this.oldCache=this.cache,this.cache=new Map)}_moveToRecent(e,t){this.oldCache.delete(e),this._set(e,t)}*_entriesAscending(){for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield e)}for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield e)}}get(e){if(this.cache.has(e)){let t=this.cache.get(e);return this._getItemValue(e,t)}if(this.oldCache.has(e)){let t=this.oldCache.get(e);if(this._deleteIfExpired(e,t)===!1)return this._moveToRecent(e,t),t.value}}set(e,t,{maxAge:i=this.maxAge===1/0?void 0:Date.now()+this.maxAge}={}){this.cache.has(e)?this.cache.set(e,{value:t,maxAge:i}):this._set(e,{value:t,expiry:i})}has(e){return this.cache.has(e)?!this._deleteIfExpired(e,this.cache.get(e)):this.oldCache.has(e)?!this._deleteIfExpired(e,this.oldCache.get(e)):!1}peek(e){if(this.cache.has(e))return this._peek(e,this.cache);if(this.oldCache.has(e))return this._peek(e,this.oldCache)}delete(e){let t=this.cache.delete(e);return t&&this._size--,this.oldCache.delete(e)||t}clear(){this.cache.clear(),this.oldCache.clear(),this._size=0}resize(e){if(!(e&&e>0))throw new TypeError("`maxSize` must be a number greater than 0");let t=[...this._entriesAscending()],i=t.length-e;i<0?(this.cache=new Map(t),this.oldCache=new Map,this._size=t.length):(i>0&&this._emitEvictions(t.slice(0,i)),this.oldCache=new Map(t.slice(i)),this.cache=new Map,this._size=0),this.maxSize=e}*keys(){for(let[e]of this)yield e}*values(){for(let[,e]of this)yield e}*[Symbol.iterator](){for(let e of this.cache){let[t,i]=e;this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}for(let e of this.oldCache){let[t,i]=e;this.cache.has(t)||this._deleteIfExpired(t,i)===!1&&(yield[t,i.value])}}*entriesDescending(){let e=[...this.cache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}e=[...this.oldCache];for(let t=e.length-1;t>=0;--t){let i=e[t],[n,a]=i;this.cache.has(n)||this._deleteIfExpired(n,a)===!1&&(yield[n,a.value])}}*entriesAscending(){for(let[e,t]of this._entriesAscending())yield[e,t.value]}get size(){if(!this._size)return this.oldCache.size;let e=0;for(let t of this.oldCache.keys())this.cache.has(t)||e++;return Math.min(this._size+e,this.maxSize)}};hf.exports=df});var mf,gf=R(()=>{u();mf=r=>r&&r._hash});function Vi(r){return mf(r,{ignoreUnknown:!0})}var yf=R(()=>{u();gf()});function xt(r){if(r=`${r}`,r==="0")return"0";if(/^[+-]?(\d+|\d*\.\d+)(e[+-]?\d+)?(%|\w+)?$/.test(r))return r.replace(/^[+-]?/,t=>t==="-"?"":"-");let e=["var","calc","min","max","clamp"];for(let t of e)if(r.includes(`${t}(`))return`calc(${r} * -1)`}var Hi=R(()=>{u()});var bf,wf=R(()=>{u();bf=["preflight","container","accessibility","pointerEvents","visibility","position","inset","isolation","zIndex","order","gridColumn","gridColumnStart","gridColumnEnd","gridRow","gridRowStart","gridRowEnd","float","clear","margin","boxSizing","lineClamp","display","aspectRatio","size","height","maxHeight","minHeight","width","minWidth","maxWidth","flex","flexShrink","flexGrow","flexBasis","tableLayout","captionSide","borderCollapse","borderSpacing","transformOrigin","translate","rotate","skew","scale","transform","animation","cursor","touchAction","userSelect","resize","scrollSnapType","scrollSnapAlign","scrollSnapStop","scrollMargin","scrollPadding","listStylePosition","listStyleType","listStyleImage","appearance","columns","breakBefore","breakInside","breakAfter","gridAutoColumns","gridAutoFlow","gridAutoRows","gridTemplateColumns","gridTemplateRows","flexDirection","flexWrap","placeContent","placeItems","alignContent","alignItems","justifyContent","justifyItems","gap","space","divideWidth","divideStyle","divideColor","divideOpacity","placeSelf","alignSelf","justifySelf","overflow","overscrollBehavior","scrollBehavior","textOverflow","hyphens","whitespace","textWrap","wordBreak","borderRadius","borderWidth","borderStyle","borderColor","borderOpacity","backgroundColor","backgroundOpacity","backgroundImage","gradientColorStops","boxDecorationBreak","backgroundSize","backgroundAttachment","backgroundClip","backgroundPosition","backgroundRepeat","backgroundOrigin","fill","stroke","strokeWidth","objectFit","objectPosition","padding","textAlign","textIndent","verticalAlign","fontFamily","fontSize","fontWeight","textTransform","fontStyle","fontVariantNumeric","lineHeight","letterSpacing","textColor","textOpacity","textDecoration","textDecorationColor","textDecorationStyle","textDecorationThickness","textUnderlineOffset","fontSmoothing","placeholderColor","placeholderOpacity","caretColor","accentColor","opacity","backgroundBlendMode","mixBlendMode","boxShadow","boxShadowColor","outlineStyle","outlineWidth","outlineOffset","outlineColor","ringWidth","ringColor","ringOpacity","ringOffsetWidth","ringOffsetColor","blur","brightness","contrast","dropShadow","grayscale","hueRotate","invert","saturate","sepia","filter","backdropBlur","backdropBrightness","backdropContrast","backdropGrayscale","backdropHueRotate","backdropInvert","backdropOpacity","backdropSaturate","backdropSepia","backdropFilter","transitionProperty","transitionDelay","transitionDuration","transitionTimingFunction","willChange","contain","content","forcedColorAdjust"]});function vf(r,e){return r===void 0?e:Array.isArray(r)?r:[...new Set(e.filter(i=>r!==!1&&r[i]!==!1).concat(Object.keys(r).filter(i=>r[i]!==!1)))]}var xf=R(()=>{u()});var kf={};Ge(kf,{default:()=>Qe});var Qe,Wi=R(()=>{u();Qe=new Proxy({},{get:()=>String})});function Bs(r,e,t){typeof m!="undefined"&&m.env.JEST_WORKER_ID||t&&Sf.has(t)||(t&&Sf.add(t),console.warn(""),e.forEach(i=>console.warn(r,"-",i)))}function Fs(r){return Qe.dim(r)}var Sf,G,Be=R(()=>{u();Wi();Sf=new Set;G={info(r,e){Bs(Qe.bold(Qe.cyan("info")),...Array.isArray(r)?[r]:[e,r])},warn(r,e){["content-problems"].includes(r)||Bs(Qe.bold(Qe.yellow("warn")),...Array.isArray(r)?[r]:[e,r])},risk(r,e){Bs(Qe.bold(Qe.magenta("risk")),...Array.isArray(r)?[r]:[e,r])}}});var Af={};Ge(Af,{default:()=>js});function qr({version:r,from:e,to:t}){G.warn(`${e}-color-renamed`,[`As of Tailwind CSS ${r}, \`${e}\` has been renamed to \`${t}\`.`,"Update your configuration file to silence this warning."])}var js,zs=R(()=>{u();Be();js={inherit:"inherit",current:"currentColor",transparent:"transparent",black:"#000",white:"#fff",slate:{50:"#f8fafc",100:"#f1f5f9",200:"#e2e8f0",300:"#cbd5e1",400:"#94a3b8",500:"#64748b",600:"#475569",700:"#334155",800:"#1e293b",900:"#0f172a",950:"#020617"},gray:{50:"#f9fafb",100:"#f3f4f6",200:"#e5e7eb",300:"#d1d5db",400:"#9ca3af",500:"#6b7280",600:"#4b5563",700:"#374151",800:"#1f2937",900:"#111827",950:"#030712"},zinc:{50:"#fafafa",100:"#f4f4f5",200:"#e4e4e7",300:"#d4d4d8",400:"#a1a1aa",500:"#71717a",600:"#52525b",700:"#3f3f46",800:"#27272a",900:"#18181b",950:"#09090b"},neutral:{50:"#fafafa",100:"#f5f5f5",200:"#e5e5e5",300:"#d4d4d4",400:"#a3a3a3",500:"#737373",600:"#525252",700:"#404040",800:"#262626",900:"#171717",950:"#0a0a0a"},stone:{50:"#fafaf9",100:"#f5f5f4",200:"#e7e5e4",300:"#d6d3d1",400:"#a8a29e",500:"#78716c",600:"#57534e",700:"#44403c",800:"#292524",900:"#1c1917",950:"#0c0a09"},red:{50:"#fef2f2",100:"#fee2e2",200:"#fecaca",300:"#fca5a5",400:"#f87171",500:"#ef4444",600:"#dc2626",700:"#b91c1c",800:"#991b1b",900:"#7f1d1d",950:"#450a0a"},orange:{50:"#fff7ed",100:"#ffedd5",200:"#fed7aa",300:"#fdba74",400:"#fb923c",500:"#f97316",600:"#ea580c",700:"#c2410c",800:"#9a3412",900:"#7c2d12",950:"#431407"},amber:{50:"#fffbeb",100:"#fef3c7",200:"#fde68a",300:"#fcd34d",400:"#fbbf24",500:"#f59e0b",600:"#d97706",700:"#b45309",800:"#92400e",900:"#78350f",950:"#451a03"},yellow:{50:"#fefce8",100:"#fef9c3",200:"#fef08a",300:"#fde047",400:"#facc15",500:"#eab308",600:"#ca8a04",700:"#a16207",800:"#854d0e",900:"#713f12",950:"#422006"},lime:{50:"#f7fee7",100:"#ecfccb",200:"#d9f99d",300:"#bef264",400:"#a3e635",500:"#84cc16",600:"#65a30d",700:"#4d7c0f",800:"#3f6212",900:"#365314",950:"#1a2e05"},green:{50:"#f0fdf4",100:"#dcfce7",200:"#bbf7d0",300:"#86efac",400:"#4ade80",500:"#22c55e",600:"#16a34a",700:"#15803d",800:"#166534",900:"#14532d",950:"#052e16"},emerald:{50:"#ecfdf5",100:"#d1fae5",200:"#a7f3d0",300:"#6ee7b7",400:"#34d399",500:"#10b981",600:"#059669",700:"#047857",800:"#065f46",900:"#064e3b",950:"#022c22"},teal:{50:"#f0fdfa",100:"#ccfbf1",200:"#99f6e4",300:"#5eead4",400:"#2dd4bf",500:"#14b8a6",600:"#0d9488",700:"#0f766e",800:"#115e59",900:"#134e4a",950:"#042f2e"},cyan:{50:"#ecfeff",100:"#cffafe",200:"#a5f3fc",300:"#67e8f9",400:"#22d3ee",500:"#06b6d4",600:"#0891b2",700:"#0e7490",800:"#155e75",900:"#164e63",950:"#083344"},sky:{50:"#f0f9ff",100:"#e0f2fe",200:"#bae6fd",300:"#7dd3fc",400:"#38bdf8",500:"#0ea5e9",600:"#0284c7",700:"#0369a1",800:"#075985",900:"#0c4a6e",950:"#082f49"},blue:{50:"#eff6ff",100:"#dbeafe",200:"#bfdbfe",300:"#93c5fd",400:"#60a5fa",500:"#3b82f6",600:"#2563eb",700:"#1d4ed8",800:"#1e40af",900:"#1e3a8a",950:"#172554"},indigo:{50:"#eef2ff",100:"#e0e7ff",200:"#c7d2fe",300:"#a5b4fc",400:"#818cf8",500:"#6366f1",600:"#4f46e5",700:"#4338ca",800:"#3730a3",900:"#312e81",950:"#1e1b4b"},violet:{50:"#f5f3ff",100:"#ede9fe",200:"#ddd6fe",300:"#c4b5fd",400:"#a78bfa",500:"#8b5cf6",600:"#7c3aed",700:"#6d28d9",800:"#5b21b6",900:"#4c1d95",950:"#2e1065"},purple:{50:"#faf5ff",100:"#f3e8ff",200:"#e9d5ff",300:"#d8b4fe",400:"#c084fc",500:"#a855f7",600:"#9333ea",700:"#7e22ce",800:"#6b21a8",900:"#581c87",950:"#3b0764"},fuchsia:{50:"#fdf4ff",100:"#fae8ff",200:"#f5d0fe",300:"#f0abfc",400:"#e879f9",500:"#d946ef",600:"#c026d3",700:"#a21caf",800:"#86198f",900:"#701a75",950:"#4a044e"},pink:{50:"#fdf2f8",100:"#fce7f3",200:"#fbcfe8",300:"#f9a8d4",400:"#f472b6",500:"#ec4899",600:"#db2777",700:"#be185d",800:"#9d174d",900:"#831843",950:"#500724"},rose:{50:"#fff1f2",100:"#ffe4e6",200:"#fecdd3",300:"#fda4af",400:"#fb7185",500:"#f43f5e",600:"#e11d48",700:"#be123c",800:"#9f1239",900:"#881337",950:"#4c0519"},get lightBlue(){return qr({version:"v2.2",from:"lightBlue",to:"sky"}),this.sky},get warmGray(){return qr({version:"v3.0",from:"warmGray",to:"stone"}),this.stone},get trueGray(){return qr({version:"v3.0",from:"trueGray",to:"neutral"}),this.neutral},get coolGray(){return qr({version:"v3.0",from:"coolGray",to:"gray"}),this.gray},get blueGray(){return qr({version:"v3.0",from:"blueGray",to:"slate"}),this.slate}}});function Us(r,...e){for(let t of e){for(let i in t)r?.hasOwnProperty?.(i)||(r[i]=t[i]);for(let i of Object.getOwnPropertySymbols(t))r?.hasOwnProperty?.(i)||(r[i]=t[i])}return r}var Cf=R(()=>{u()});function kt(r){if(Array.isArray(r))return r;let e=r.split("[").length-1,t=r.split("]").length-1;if(e!==t)throw new Error(`Path is invalid. Has unbalanced brackets: ${r}`);return r.split(/\.(?![^\[]*\])|[\[\]]/g).filter(Boolean)}var Gi=R(()=>{u()});function we(r,e){return Qi.future.includes(e)?r.future==="all"||(r?.future?.[e]??_f[e]??!1):Qi.experimental.includes(e)?r.experimental==="all"||(r?.experimental?.[e]??_f[e]??!1):!1}function Ef(r){return r.experimental==="all"?Qi.experimental:Object.keys(r?.experimental??{}).filter(e=>Qi.experimental.includes(e)&&r.experimental[e])}function Of(r){if(m.env.JEST_WORKER_ID===void 0&&Ef(r).length>0){let e=Ef(r).map(t=>Qe.yellow(t)).join(", ");G.warn("experimental-flags-enabled",[`You have enabled experimental features: ${e}`,"Experimental features in Tailwind CSS are not covered by semver, may introduce breaking changes, and can change at any time."])}}var _f,Qi,ct=R(()=>{u();Wi();Be();_f={optimizeUniversalDefaults:!1,generalizedModifiers:!0,disableColorOpacityUtilitiesByDefault:!1,relativeContentPathsByDefault:!1},Qi={future:["hoverOnlyWhenSupported","respectDefaultRingColorOpacity","disableColorOpacityUtilitiesByDefault","relativeContentPathsByDefault"],experimental:["optimizeUniversalDefaults","generalizedModifiers"]}});function Tf(r){(()=>{if(r.purge||!r.content||!Array.isArray(r.content)&&!(typeof r.content=="object"&&r.content!==null))return!1;if(Array.isArray(r.content))return r.content.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string"));if(typeof r.content=="object"&&r.content!==null){if(Object.keys(r.content).some(t=>!["files","relative","extract","transform"].includes(t)))return!1;if(Array.isArray(r.content.files)){if(!r.content.files.every(t=>typeof t=="string"?!0:!(typeof t?.raw!="string"||t?.extension&&typeof t?.extension!="string")))return!1;if(typeof r.content.extract=="object"){for(let t of Object.values(r.content.extract))if(typeof t!="function")return!1}else if(!(r.content.extract===void 0||typeof r.content.extract=="function"))return!1;if(typeof r.content.transform=="object"){for(let t of Object.values(r.content.transform))if(typeof t!="function")return!1}else if(!(r.content.transform===void 0||typeof r.content.transform=="function"))return!1;if(typeof r.content.relative!="boolean"&&typeof r.content.relative!="undefined")return!1}return!0}return!1})()||G.warn("purge-deprecation",["The `purge`/`content` options have changed in Tailwind CSS v3.0.","Update your configuration file to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#configure-content-sources"]),r.safelist=(()=>{let{content:t,purge:i,safelist:n}=r;return Array.isArray(n)?n:Array.isArray(t?.safelist)?t.safelist:Array.isArray(i?.safelist)?i.safelist:Array.isArray(i?.options?.safelist)?i.options.safelist:[]})(),r.blocklist=(()=>{let{blocklist:t}=r;if(Array.isArray(t)){if(t.every(i=>typeof i=="string"))return t;G.warn("blocklist-invalid",["The `blocklist` option must be an array of strings.","https://tailwindcss.com/docs/content-configuration#discarding-classes"])}return[]})(),typeof r.prefix=="function"?(G.warn("prefix-function",["As of Tailwind CSS v3.0, `prefix` cannot be a function.","Update `prefix` in your configuration to be a string to eliminate this warning.","https://tailwindcss.com/docs/upgrade-guide#prefix-cannot-be-a-function"]),r.prefix=""):r.prefix=r.prefix??"",r.content={relative:(()=>{let{content:t}=r;return t?.relative?t.relative:we(r,"relativeContentPathsByDefault")})(),files:(()=>{let{content:t,purge:i}=r;return Array.isArray(i)?i:Array.isArray(i?.content)?i.content:Array.isArray(t)?t:Array.isArray(t?.content)?t.content:Array.isArray(t?.files)?t.files:[]})(),extract:(()=>{let t=(()=>r.purge?.extract?r.purge.extract:r.content?.extract?r.content.extract:r.purge?.extract?.DEFAULT?r.purge.extract.DEFAULT:r.content?.extract?.DEFAULT?r.content.extract.DEFAULT:r.purge?.options?.extractors?r.purge.options.extractors:r.content?.options?.extractors?r.content.options.extractors:{})(),i={},n=(()=>{if(r.purge?.options?.defaultExtractor)return r.purge.options.defaultExtractor;if(r.content?.options?.defaultExtractor)return r.content.options.defaultExtractor})();if(n!==void 0&&(i.DEFAULT=n),typeof t=="function")i.DEFAULT=t;else if(Array.isArray(t))for(let{extensions:a,extractor:s}of t??[])for(let o of a)i[o]=s;else typeof t=="object"&&t!==null&&Object.assign(i,t);return i})(),transform:(()=>{let t=(()=>r.purge?.transform?r.purge.transform:r.content?.transform?r.content.transform:r.purge?.transform?.DEFAULT?r.purge.transform.DEFAULT:r.content?.transform?.DEFAULT?r.content.transform.DEFAULT:{})(),i={};return typeof t=="function"?i.DEFAULT=t:typeof t=="object"&&t!==null&&Object.assign(i,t),i})()};for(let t of r.content.files)if(typeof t=="string"&&/{([^,]*?)}/g.test(t)){G.warn("invalid-glob-braces",[`The glob pattern ${Fs(t)} in your Tailwind CSS configuration is invalid.`,`Update it to ${Fs(t.replace(/{([^,]*?)}/g,"$1"))} to silence this warning.`]);break}return r}var Rf=R(()=>{u();ct();Be()});function ke(r){if(Object.prototype.toString.call(r)!=="[object Object]")return!1;let e=Object.getPrototypeOf(r);return e===null||Object.getPrototypeOf(e)===null}var Kt=R(()=>{u()});function St(r){return Array.isArray(r)?r.map(e=>St(e)):typeof r=="object"&&r!==null?Object.fromEntries(Object.entries(r).map(([e,t])=>[e,St(t)])):r}var Yi=R(()=>{u()});function jt(r){return r.replace(/\\,/g,"\\2c ")}var Ki=R(()=>{u()});var Vs,Pf=R(()=>{u();Vs={aliceblue:[240,248,255],antiquewhite:[250,235,215],aqua:[0,255,255],aquamarine:[127,255,212],azure:[240,255,255],beige:[245,245,220],bisque:[255,228,196],black:[0,0,0],blanchedalmond:[255,235,205],blue:[0,0,255],blueviolet:[138,43,226],brown:[165,42,42],burlywood:[222,184,135],cadetblue:[95,158,160],chartreuse:[127,255,0],chocolate:[210,105,30],coral:[255,127,80],cornflowerblue:[100,149,237],cornsilk:[255,248,220],crimson:[220,20,60],cyan:[0,255,255],darkblue:[0,0,139],darkcyan:[0,139,139],darkgoldenrod:[184,134,11],darkgray:[169,169,169],darkgreen:[0,100,0],darkgrey:[169,169,169],darkkhaki:[189,183,107],darkmagenta:[139,0,139],darkolivegreen:[85,107,47],darkorange:[255,140,0],darkorchid:[153,50,204],darkred:[139,0,0],darksalmon:[233,150,122],darkseagreen:[143,188,143],darkslateblue:[72,61,139],darkslategray:[47,79,79],darkslategrey:[47,79,79],darkturquoise:[0,206,209],darkviolet:[148,0,211],deeppink:[255,20,147],deepskyblue:[0,191,255],dimgray:[105,105,105],dimgrey:[105,105,105],dodgerblue:[30,144,255],firebrick:[178,34,34],floralwhite:[255,250,240],forestgreen:[34,139,34],fuchsia:[255,0,255],gainsboro:[220,220,220],ghostwhite:[248,248,255],gold:[255,215,0],goldenrod:[218,165,32],gray:[128,128,128],green:[0,128,0],greenyellow:[173,255,47],grey:[128,128,128],honeydew:[240,255,240],hotpink:[255,105,180],indianred:[205,92,92],indigo:[75,0,130],ivory:[255,255,240],khaki:[240,230,140],lavender:[230,230,250],lavenderblush:[255,240,245],lawngreen:[124,252,0],lemonchiffon:[255,250,205],lightblue:[173,216,230],lightcoral:[240,128,128],lightcyan:[224,255,255],lightgoldenrodyellow:[250,250,210],lightgray:[211,211,211],lightgreen:[144,238,144],lightgrey:[211,211,211],lightpink:[255,182,193],lightsalmon:[255,160,122],lightseagreen:[32,178,170],lightskyblue:[135,206,250],lightslategray:[119,136,153],lightslategrey:[119,136,153],lightsteelblue:[176,196,222],lightyellow:[255,255,224],lime:[0,255,0],limegreen:[50,205,50],linen:[250,240,230],magenta:[255,0,255],maroon:[128,0,0],mediumaquamarine:[102,205,170],mediumblue:[0,0,205],mediumorchid:[186,85,211],mediumpurple:[147,112,219],mediumseagreen:[60,179,113],mediumslateblue:[123,104,238],mediumspringgreen:[0,250,154],mediumturquoise:[72,209,204],mediumvioletred:[199,21,133],midnightblue:[25,25,112],mintcream:[245,255,250],mistyrose:[255,228,225],moccasin:[255,228,181],navajowhite:[255,222,173],navy:[0,0,128],oldlace:[253,245,230],olive:[128,128,0],olivedrab:[107,142,35],orange:[255,165,0],orangered:[255,69,0],orchid:[218,112,214],palegoldenrod:[238,232,170],palegreen:[152,251,152],paleturquoise:[175,238,238],palevioletred:[219,112,147],papayawhip:[255,239,213],peachpuff:[255,218,185],peru:[205,133,63],pink:[255,192,203],plum:[221,160,221],powderblue:[176,224,230],purple:[128,0,128],rebeccapurple:[102,51,153],red:[255,0,0],rosybrown:[188,143,143],royalblue:[65,105,225],saddlebrown:[139,69,19],salmon:[250,128,114],sandybrown:[244,164,96],seagreen:[46,139,87],seashell:[255,245,238],sienna:[160,82,45],silver:[192,192,192],skyblue:[135,206,235],slateblue:[106,90,205],slategray:[112,128,144],slategrey:[112,128,144],snow:[255,250,250],springgreen:[0,255,127],steelblue:[70,130,180],tan:[210,180,140],teal:[0,128,128],thistle:[216,191,216],tomato:[255,99,71],turquoise:[64,224,208],violet:[238,130,238],wheat:[245,222,179],white:[255,255,255],whitesmoke:[245,245,245],yellow:[255,255,0],yellowgreen:[154,205,50]}});function $r(r,{loose:e=!1}={}){if(typeof r!="string")return null;if(r=r.trim(),r==="transparent")return{mode:"rgb",color:["0","0","0"],alpha:"0"};if(r in Vs)return{mode:"rgb",color:Vs[r].map(a=>a.toString())};let t=r.replace(Fv,(a,s,o,l,c)=>["#",s,s,o,o,l,l,c?c+c:""].join("")).match(Bv);if(t!==null)return{mode:"rgb",color:[parseInt(t[1],16),parseInt(t[2],16),parseInt(t[3],16)].map(a=>a.toString()),alpha:t[4]?(parseInt(t[4],16)/255).toString():void 0};let i=r.match(jv)??r.match(zv);if(i===null)return null;let n=[i[2],i[3],i[4]].filter(Boolean).map(a=>a.toString());return n.length===2&&n[0].startsWith("var(")?{mode:i[1],color:[n[0]],alpha:n[1]}:!e&&n.length!==3||n.length<3&&!n.some(a=>/^var\(.*?\)$/.test(a))?null:{mode:i[1],color:n,alpha:i[5]?.toString?.()}}function Hs({mode:r,color:e,alpha:t}){let i=t!==void 0;return r==="rgba"||r==="hsla"?`${r}(${e.join(", ")}${i?`, ${t}`:""})`:`${r}(${e.join(" ")}${i?` / ${t}`:""})`}var Bv,Fv,At,Xi,If,Ct,jv,zv,Ws=R(()=>{u();Pf();Bv=/^#([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})?$/i,Fv=/^#([a-f\d])([a-f\d])([a-f\d])([a-f\d])?$/i,At=/(?:\d+|\d*\.\d+)%?/,Xi=/(?:\s*,\s*|\s+)/,If=/\s*[,/]\s*/,Ct=/var\(--(?:[^ )]*?)(?:,(?:[^ )]*?|var\(--[^ )]*?\)))?\)/,jv=new RegExp(`^(rgba?)\\(\\s*(${At.source}|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`),zv=new RegExp(`^(hsla?)\\(\\s*((?:${At.source})(?:deg|rad|grad|turn)?|${Ct.source})(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${Xi.source}(${At.source}|${Ct.source}))?(?:${If.source}(${At.source}|${Ct.source}))?\\s*\\)$`)});function Ze(r,e,t){if(typeof r=="function")return r({opacityValue:e});let i=$r(r,{loose:!0});return i===null?t:Hs({...i,alpha:e})}function Ae({color:r,property:e,variable:t}){let i=[].concat(e);if(typeof r=="function")return{[t]:"1",...Object.fromEntries(i.map(a=>[a,r({opacityVariable:t,opacityValue:`var(${t})`})]))};let n=$r(r);return n===null?Object.fromEntries(i.map(a=>[a,r])):n.alpha!==void 0?Object.fromEntries(i.map(a=>[a,r])):{[t]:"1",...Object.fromEntries(i.map(a=>[a,Hs({...n,alpha:`var(${t})`})]))}}var Lr=R(()=>{u();Ws()});function ve(r,e){let t=[],i=[],n=0,a=!1;for(let s=0;s{u()});function Ji(r){return ve(r,",").map(t=>{let i=t.trim(),n={raw:i},a=i.split(Vv),s=new Set;for(let o of a)Df.lastIndex=0,!s.has("KEYWORD")&&Uv.has(o)?(n.keyword=o,s.add("KEYWORD")):Df.test(o)?s.has("X")?s.has("Y")?s.has("BLUR")?s.has("SPREAD")||(n.spread=o,s.add("SPREAD")):(n.blur=o,s.add("BLUR")):(n.y=o,s.add("Y")):(n.x=o,s.add("X")):n.color?(n.unknown||(n.unknown=[]),n.unknown.push(o)):n.color=o;return n.valid=n.x!==void 0&&n.y!==void 0,n})}function qf(r){return r.map(e=>e.valid?[e.keyword,e.x,e.y,e.blur,e.spread,e.color].filter(Boolean).join(" "):e.raw).join(", ")}var Uv,Vv,Df,Gs=R(()=>{u();zt();Uv=new Set(["inset","inherit","initial","revert","unset"]),Vv=/\ +(?![^(]*\))/g,Df=/^-?(\d+|\.\d+)(.*?)$/g});function Qs(r){return Hv.some(e=>new RegExp(`^${e}\\(.*\\)`).test(r))}function K(r,e=null,t=!0){let i=e&&Wv.has(e.property);return r.startsWith("--")&&!i?`var(${r})`:r.includes("url(")?r.split(/(url\(.*?\))/g).filter(Boolean).map(n=>/^url\(.*?\)$/.test(n)?n:K(n,e,!1)).join(""):(r=r.replace(/([^\\])_+/g,(n,a)=>a+" ".repeat(n.length-1)).replace(/^_/g," ").replace(/\\_/g,"_"),t&&(r=r.trim()),r=Gv(r),r)}function Ye(r){return r.includes("=")&&(r=r.replace(/(=.*)/g,(e,t)=>{if(t[1]==="'"||t[1]==='"')return t;if(t.length>2){let i=t[t.length-1];if(t[t.length-2]===" "&&(i==="i"||i==="I"||i==="s"||i==="S"))return`="${t.slice(1,-2)}" ${t[t.length-1]}`}return`="${t.slice(1)}"`})),r}function Gv(r){let e=["theme"],t=["min-content","max-content","fit-content","safe-area-inset-top","safe-area-inset-right","safe-area-inset-bottom","safe-area-inset-left","titlebar-area-x","titlebar-area-y","titlebar-area-width","titlebar-area-height","keyboard-inset-top","keyboard-inset-right","keyboard-inset-bottom","keyboard-inset-left","keyboard-inset-width","keyboard-inset-height","radial-gradient","linear-gradient","conic-gradient","repeating-radial-gradient","repeating-linear-gradient","repeating-conic-gradient","anchor-size"];return r.replace(/(calc|min|max|clamp)\(.+\)/g,i=>{let n="";function a(){let s=n.trimEnd();return s[s.length-1]}for(let s=0;si[s+p]===d)},l=function(f){let d=1/0;for(let h of f){let b=i.indexOf(h,s);b!==-1&&bo(f))){let f=t.find(d=>o(d));n+=f,s+=f.length-1}else e.some(f=>o(f))?n+=l([")"]):o("[")?n+=l(["]"]):["+","-","*","/"].includes(c)&&!["(","+","-","*","/",","].includes(a())?n+=` ${c} `:n+=c}return n.replace(/\s+/g," ")})}function Ys(r){return r.startsWith("url(")}function Ks(r){return!isNaN(Number(r))||Qs(r)}function Mr(r){return r.endsWith("%")&&Ks(r.slice(0,-1))||Qs(r)}function Nr(r){return r==="0"||new RegExp(`^[+-]?[0-9]*.?[0-9]+(?:[eE][+-]?[0-9]+)?${Yv}$`).test(r)||Qs(r)}function $f(r){return Kv.has(r)}function Lf(r){let e=Ji(K(r));for(let t of e)if(!t.valid)return!1;return!0}function Mf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:$r(i,{loose:!0})!==null?(e++,!0):!1))?e>0:!1}function Nf(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:Ys(i)||Jv(i)||["element(","image(","cross-fade(","image-set("].some(n=>i.startsWith(n))?(e++,!0):!1))?e>0:!1}function Jv(r){r=K(r);for(let e of Xv)if(r.startsWith(`${e}(`))return!0;return!1}function Bf(r){let e=0;return ve(r,"_").every(i=>(i=K(i),i.startsWith("var(")?!0:Zv.has(i)||Nr(i)||Mr(i)?(e++,!0):!1))?e>0:!1}function Ff(r){let e=0;return ve(r,",").every(i=>(i=K(i),i.startsWith("var(")?!0:i.includes(" ")&&!/(['"])([^"']+)\1/g.test(i)||/^\d/g.test(i)?!1:(e++,!0)))?e>0:!1}function jf(r){return ex.has(r)}function zf(r){return tx.has(r)}function Uf(r){return rx.has(r)}var Hv,Wv,Qv,Yv,Kv,Xv,Zv,ex,tx,rx,Br=R(()=>{u();Ws();Gs();zt();Hv=["min","max","clamp","calc"];Wv=new Set(["scroll-timeline-name","timeline-scope","view-timeline-name","font-palette","anchor-name","anchor-scope","position-anchor","position-try-options","scroll-timeline","animation-timeline","view-timeline","position-try"]);Qv=["cm","mm","Q","in","pc","pt","px","em","ex","ch","rem","lh","rlh","vw","vh","vmin","vmax","vb","vi","svw","svh","lvw","lvh","dvw","dvh","cqw","cqh","cqi","cqb","cqmin","cqmax"],Yv=`(?:${Qv.join("|")})`;Kv=new Set(["thin","medium","thick"]);Xv=new Set(["conic-gradient","linear-gradient","radial-gradient","repeating-conic-gradient","repeating-linear-gradient","repeating-radial-gradient"]);Zv=new Set(["center","top","right","bottom","left"]);ex=new Set(["serif","sans-serif","monospace","cursive","fantasy","system-ui","ui-serif","ui-sans-serif","ui-monospace","ui-rounded","math","emoji","fangsong"]);tx=new Set(["xx-small","x-small","small","medium","large","x-large","xx-large","xxx-large"]);rx=new Set(["larger","smaller"])});function Vf(r){let e=["cover","contain"];return ve(r,",").every(t=>{let i=ve(t,"_").filter(Boolean);return i.length===1&&e.includes(i[0])?!0:i.length!==1&&i.length!==2?!1:i.every(n=>Nr(n)||Mr(n)||n==="auto")})}var Hf=R(()=>{u();Br();zt()});function Wf(r,e){r.walkClasses(t=>{t.value=e(t.value),t.raws&&t.raws.value&&(t.raws.value=jt(t.raws.value))})}function Gf(r,e){if(!_t(r))return;let t=r.slice(1,-1);if(!!e(t))return K(t)}function ix(r,e={},t){let i=e[r];if(i!==void 0)return xt(i);if(_t(r)){let n=Gf(r,t);return n===void 0?void 0:xt(n)}}function Zi(r,e={},{validate:t=()=>!0}={}){let i=e.values?.[r];return i!==void 0?i:e.supportsNegativeValues&&r.startsWith("-")?ix(r.slice(1),e.values,t):Gf(r,t)}function _t(r){return r.startsWith("[")&&r.endsWith("]")}function Qf(r){let e=r.lastIndexOf("/"),t=r.lastIndexOf("[",e),i=r.indexOf("]",e);return r[e-1]==="]"||r[e+1]==="["||t!==-1&&i!==-1&&t")){let e=r;return({opacityValue:t=1})=>e.replace(//g,t)}return r}function Yf(r){return K(r.slice(1,-1))}function nx(r,e={},{tailwindConfig:t={}}={}){if(e.values?.[r]!==void 0)return Xt(e.values?.[r]);let[i,n]=Qf(r);if(n!==void 0){let a=e.values?.[i]??(_t(i)?i.slice(1,-1):void 0);return a===void 0?void 0:(a=Xt(a),_t(n)?Ze(a,Yf(n)):t.theme?.opacity?.[n]===void 0?void 0:Ze(a,t.theme.opacity[n]))}return Zi(r,e,{validate:Mf})}function sx(r,e={}){return e.values?.[r]}function qe(r){return(e,t)=>Zi(e,t,{validate:r})}function ax(r,e){let t=r.indexOf(e);return t===-1?[void 0,r]:[r.slice(0,t),r.slice(t+1)]}function Js(r,e,t,i){if(t.values&&e in t.values)for(let{type:a}of r??[]){let s=Xs[a](e,t,{tailwindConfig:i});if(s!==void 0)return[s,a,null]}if(_t(e)){let a=e.slice(1,-1),[s,o]=ax(a,":");if(!/^[\w-_]+$/g.test(s))o=a;else if(s!==void 0&&!Kf.includes(s))return[];if(o.length>0&&Kf.includes(s))return[Zi(`[${o}]`,t),s,null]}let n=Zs(r,e,t,i);for(let a of n)return a;return[]}function*Zs(r,e,t,i){let n=we(i,"generalizedModifiers"),[a,s]=Qf(e);if(n&&t.modifiers!=null&&(t.modifiers==="any"||typeof t.modifiers=="object"&&(s&&_t(s)||s in t.modifiers))||(a=e,s=void 0),s!==void 0&&a===""&&(a="DEFAULT"),s!==void 0&&typeof t.modifiers=="object"){let l=t.modifiers?.[s]??null;l!==null?s=l:_t(s)&&(s=Yf(s))}for(let{type:l}of r??[]){let c=Xs[l](a,t,{tailwindConfig:i});c!==void 0&&(yield[c,l,s??null])}}var Xs,Kf,Fr=R(()=>{u();Ki();Lr();Br();Hi();Hf();ct();Xs={any:Zi,color:nx,url:qe(Ys),image:qe(Nf),length:qe(Nr),percentage:qe(Mr),position:qe(Bf),lookup:sx,"generic-name":qe(jf),"family-name":qe(Ff),number:qe(Ks),"line-width":qe($f),"absolute-size":qe(zf),"relative-size":qe(Uf),shadow:qe(Lf),size:qe(Vf)},Kf=Object.keys(Xs)});function X(r){return typeof r=="function"?r({}):r}var ea=R(()=>{u()});function Jt(r){return typeof r=="function"}function jr(r,...e){let t=e.pop();for(let i of e)for(let n in i){let a=t(r[n],i[n]);a===void 0?ke(r[n])&&ke(i[n])?r[n]=jr({},r[n],i[n],t):r[n]=i[n]:r[n]=a}return r}function ox(r,...e){return Jt(r)?r(...e):r}function lx(r){return r.reduce((e,{extend:t})=>jr(e,t,(i,n)=>i===void 0?[n]:Array.isArray(i)?[n,...i]:[n,i]),{})}function ux(r){return{...r.reduce((e,t)=>Us(e,t),{}),extend:lx(r)}}function Xf(r,e){if(Array.isArray(r)&&ke(r[0]))return r.concat(e);if(Array.isArray(e)&&ke(e[0])&&ke(r))return[r,...e];if(Array.isArray(e))return e}function fx({extend:r,...e}){return jr(e,r,(t,i)=>!Jt(t)&&!i.some(Jt)?jr({},t,...i,Xf):(n,a)=>jr({},...[t,...i].map(s=>ox(s,n,a)),Xf))}function*cx(r){let e=kt(r);if(e.length===0||(yield e,Array.isArray(r)))return;let t=/^(.*?)\s*\/\s*([^/]+)$/,i=r.match(t);if(i!==null){let[,n,a]=i,s=kt(n);s.alpha=a,yield s}}function px(r){let e=(t,i)=>{for(let n of cx(t)){let a=0,s=r;for(;s!=null&&a(t[i]=Jt(r[i])?r[i](e,ta):r[i],t),{})}function Jf(r){let e=[];return r.forEach(t=>{e=[...e,t];let i=t?.plugins??[];i.length!==0&&i.forEach(n=>{n.__isOptionsFunction&&(n=n()),e=[...e,...Jf([n?.config??{}])]})}),e}function dx(r){return[...r].reduceRight((t,i)=>Jt(i)?i({corePlugins:t}):vf(i,t),bf)}function hx(r){return[...r].reduceRight((t,i)=>[...t,...i],[])}function ra(r){let e=[...Jf(r),{prefix:"",important:!1,separator:":"}];return Tf(Us({theme:px(fx(ux(e.map(t=>t?.theme??{})))),corePlugins:dx(e.map(t=>t.corePlugins)),plugins:hx(r.map(t=>t?.plugins??[]))},...e))}var ta,Zf=R(()=>{u();Hi();wf();xf();zs();Cf();Gi();Rf();Kt();Yi();Fr();Lr();ea();ta={colors:js,negative(r){return Object.keys(r).filter(e=>r[e]!=="0").reduce((e,t)=>{let i=xt(r[t]);return i!==void 0&&(e[`-${t}`]=i),e},{})},breakpoints(r){return Object.keys(r).filter(e=>typeof r[e]=="string").reduce((e,t)=>({...e,[`screen-${t}`]:r[t]}),{})}}});var en=x((l3,ec)=>{u();ec.exports={content:[],presets:[],darkMode:"media",theme:{accentColor:({theme:r})=>({...r("colors"),auto:"auto"}),animation:{none:"none",spin:"spin 1s linear infinite",ping:"ping 1s cubic-bezier(0, 0, 0.2, 1) infinite",pulse:"pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite",bounce:"bounce 1s infinite"},aria:{busy:'busy="true"',checked:'checked="true"',disabled:'disabled="true"',expanded:'expanded="true"',hidden:'hidden="true"',pressed:'pressed="true"',readonly:'readonly="true"',required:'required="true"',selected:'selected="true"'},aspectRatio:{auto:"auto",square:"1 / 1",video:"16 / 9"},backdropBlur:({theme:r})=>r("blur"),backdropBrightness:({theme:r})=>r("brightness"),backdropContrast:({theme:r})=>r("contrast"),backdropGrayscale:({theme:r})=>r("grayscale"),backdropHueRotate:({theme:r})=>r("hueRotate"),backdropInvert:({theme:r})=>r("invert"),backdropOpacity:({theme:r})=>r("opacity"),backdropSaturate:({theme:r})=>r("saturate"),backdropSepia:({theme:r})=>r("sepia"),backgroundColor:({theme:r})=>r("colors"),backgroundImage:{none:"none","gradient-to-t":"linear-gradient(to top, var(--tw-gradient-stops))","gradient-to-tr":"linear-gradient(to top right, var(--tw-gradient-stops))","gradient-to-r":"linear-gradient(to right, var(--tw-gradient-stops))","gradient-to-br":"linear-gradient(to bottom right, var(--tw-gradient-stops))","gradient-to-b":"linear-gradient(to bottom, var(--tw-gradient-stops))","gradient-to-bl":"linear-gradient(to bottom left, var(--tw-gradient-stops))","gradient-to-l":"linear-gradient(to left, var(--tw-gradient-stops))","gradient-to-tl":"linear-gradient(to top left, var(--tw-gradient-stops))"},backgroundOpacity:({theme:r})=>r("opacity"),backgroundPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},backgroundSize:{auto:"auto",cover:"cover",contain:"contain"},blur:{0:"0",none:"",sm:"4px",DEFAULT:"8px",md:"12px",lg:"16px",xl:"24px","2xl":"40px","3xl":"64px"},borderColor:({theme:r})=>({...r("colors"),DEFAULT:r("colors.gray.200","currentColor")}),borderOpacity:({theme:r})=>r("opacity"),borderRadius:{none:"0px",sm:"0.125rem",DEFAULT:"0.25rem",md:"0.375rem",lg:"0.5rem",xl:"0.75rem","2xl":"1rem","3xl":"1.5rem",full:"9999px"},borderSpacing:({theme:r})=>({...r("spacing")}),borderWidth:{DEFAULT:"1px",0:"0px",2:"2px",4:"4px",8:"8px"},boxShadow:{sm:"0 1px 2px 0 rgb(0 0 0 / 0.05)",DEFAULT:"0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",md:"0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",lg:"0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1)",xl:"0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1)","2xl":"0 25px 50px -12px rgb(0 0 0 / 0.25)",inner:"inset 0 2px 4px 0 rgb(0 0 0 / 0.05)",none:"none"},boxShadowColor:({theme:r})=>r("colors"),brightness:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5",200:"2"},caretColor:({theme:r})=>r("colors"),colors:({colors:r})=>({inherit:r.inherit,current:r.current,transparent:r.transparent,black:r.black,white:r.white,slate:r.slate,gray:r.gray,zinc:r.zinc,neutral:r.neutral,stone:r.stone,red:r.red,orange:r.orange,amber:r.amber,yellow:r.yellow,lime:r.lime,green:r.green,emerald:r.emerald,teal:r.teal,cyan:r.cyan,sky:r.sky,blue:r.blue,indigo:r.indigo,violet:r.violet,purple:r.purple,fuchsia:r.fuchsia,pink:r.pink,rose:r.rose}),columns:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12","3xs":"16rem","2xs":"18rem",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem"},container:{},content:{none:"none"},contrast:{0:"0",50:".5",75:".75",100:"1",125:"1.25",150:"1.5",200:"2"},cursor:{auto:"auto",default:"default",pointer:"pointer",wait:"wait",text:"text",move:"move",help:"help","not-allowed":"not-allowed",none:"none","context-menu":"context-menu",progress:"progress",cell:"cell",crosshair:"crosshair","vertical-text":"vertical-text",alias:"alias",copy:"copy","no-drop":"no-drop",grab:"grab",grabbing:"grabbing","all-scroll":"all-scroll","col-resize":"col-resize","row-resize":"row-resize","n-resize":"n-resize","e-resize":"e-resize","s-resize":"s-resize","w-resize":"w-resize","ne-resize":"ne-resize","nw-resize":"nw-resize","se-resize":"se-resize","sw-resize":"sw-resize","ew-resize":"ew-resize","ns-resize":"ns-resize","nesw-resize":"nesw-resize","nwse-resize":"nwse-resize","zoom-in":"zoom-in","zoom-out":"zoom-out"},divideColor:({theme:r})=>r("borderColor"),divideOpacity:({theme:r})=>r("borderOpacity"),divideWidth:({theme:r})=>r("borderWidth"),dropShadow:{sm:"0 1px 1px rgb(0 0 0 / 0.05)",DEFAULT:["0 1px 2px rgb(0 0 0 / 0.1)","0 1px 1px rgb(0 0 0 / 0.06)"],md:["0 4px 3px rgb(0 0 0 / 0.07)","0 2px 2px rgb(0 0 0 / 0.06)"],lg:["0 10px 8px rgb(0 0 0 / 0.04)","0 4px 3px rgb(0 0 0 / 0.1)"],xl:["0 20px 13px rgb(0 0 0 / 0.03)","0 8px 5px rgb(0 0 0 / 0.08)"],"2xl":"0 25px 25px rgb(0 0 0 / 0.15)",none:"0 0 #0000"},fill:({theme:r})=>({none:"none",...r("colors")}),flex:{1:"1 1 0%",auto:"1 1 auto",initial:"0 1 auto",none:"none"},flexBasis:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%"}),flexGrow:{0:"0",DEFAULT:"1"},flexShrink:{0:"0",DEFAULT:"1"},fontFamily:{sans:["ui-sans-serif","system-ui","sans-serif",'"Apple Color Emoji"','"Segoe UI Emoji"','"Segoe UI Symbol"','"Noto Color Emoji"'],serif:["ui-serif","Georgia","Cambria",'"Times New Roman"',"Times","serif"],mono:["ui-monospace","SFMono-Regular","Menlo","Monaco","Consolas",'"Liberation Mono"','"Courier New"',"monospace"]},fontSize:{xs:["0.75rem",{lineHeight:"1rem"}],sm:["0.875rem",{lineHeight:"1.25rem"}],base:["1rem",{lineHeight:"1.5rem"}],lg:["1.125rem",{lineHeight:"1.75rem"}],xl:["1.25rem",{lineHeight:"1.75rem"}],"2xl":["1.5rem",{lineHeight:"2rem"}],"3xl":["1.875rem",{lineHeight:"2.25rem"}],"4xl":["2.25rem",{lineHeight:"2.5rem"}],"5xl":["3rem",{lineHeight:"1"}],"6xl":["3.75rem",{lineHeight:"1"}],"7xl":["4.5rem",{lineHeight:"1"}],"8xl":["6rem",{lineHeight:"1"}],"9xl":["8rem",{lineHeight:"1"}]},fontWeight:{thin:"100",extralight:"200",light:"300",normal:"400",medium:"500",semibold:"600",bold:"700",extrabold:"800",black:"900"},gap:({theme:r})=>r("spacing"),gradientColorStops:({theme:r})=>r("colors"),gradientColorStopPositions:{"0%":"0%","5%":"5%","10%":"10%","15%":"15%","20%":"20%","25%":"25%","30%":"30%","35%":"35%","40%":"40%","45%":"45%","50%":"50%","55%":"55%","60%":"60%","65%":"65%","70%":"70%","75%":"75%","80%":"80%","85%":"85%","90%":"90%","95%":"95%","100%":"100%"},grayscale:{0:"0",DEFAULT:"100%"},gridAutoColumns:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridAutoRows:{auto:"auto",min:"min-content",max:"max-content",fr:"minmax(0, 1fr)"},gridColumn:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridColumnEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridColumnStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRow:{auto:"auto","span-1":"span 1 / span 1","span-2":"span 2 / span 2","span-3":"span 3 / span 3","span-4":"span 4 / span 4","span-5":"span 5 / span 5","span-6":"span 6 / span 6","span-7":"span 7 / span 7","span-8":"span 8 / span 8","span-9":"span 9 / span 9","span-10":"span 10 / span 10","span-11":"span 11 / span 11","span-12":"span 12 / span 12","span-full":"1 / -1"},gridRowEnd:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridRowStart:{auto:"auto",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12",13:"13"},gridTemplateColumns:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},gridTemplateRows:{none:"none",subgrid:"subgrid",1:"repeat(1, minmax(0, 1fr))",2:"repeat(2, minmax(0, 1fr))",3:"repeat(3, minmax(0, 1fr))",4:"repeat(4, minmax(0, 1fr))",5:"repeat(5, minmax(0, 1fr))",6:"repeat(6, minmax(0, 1fr))",7:"repeat(7, minmax(0, 1fr))",8:"repeat(8, minmax(0, 1fr))",9:"repeat(9, minmax(0, 1fr))",10:"repeat(10, minmax(0, 1fr))",11:"repeat(11, minmax(0, 1fr))",12:"repeat(12, minmax(0, 1fr))"},height:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),hueRotate:{0:"0deg",15:"15deg",30:"30deg",60:"60deg",90:"90deg",180:"180deg"},inset:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),invert:{0:"0",DEFAULT:"100%"},keyframes:{spin:{to:{transform:"rotate(360deg)"}},ping:{"75%, 100%":{transform:"scale(2)",opacity:"0"}},pulse:{"50%":{opacity:".5"}},bounce:{"0%, 100%":{transform:"translateY(-25%)",animationTimingFunction:"cubic-bezier(0.8,0,1,1)"},"50%":{transform:"none",animationTimingFunction:"cubic-bezier(0,0,0.2,1)"}}},letterSpacing:{tighter:"-0.05em",tight:"-0.025em",normal:"0em",wide:"0.025em",wider:"0.05em",widest:"0.1em"},lineHeight:{none:"1",tight:"1.25",snug:"1.375",normal:"1.5",relaxed:"1.625",loose:"2",3:".75rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem"},listStyleType:{none:"none",disc:"disc",decimal:"decimal"},listStyleImage:{none:"none"},margin:({theme:r})=>({auto:"auto",...r("spacing")}),lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"},maxHeight:({theme:r})=>({...r("spacing"),none:"none",full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),maxWidth:({theme:r,breakpoints:e})=>({...r("spacing"),none:"none",xs:"20rem",sm:"24rem",md:"28rem",lg:"32rem",xl:"36rem","2xl":"42rem","3xl":"48rem","4xl":"56rem","5xl":"64rem","6xl":"72rem","7xl":"80rem",full:"100%",min:"min-content",max:"max-content",fit:"fit-content",prose:"65ch",...e(r("screens"))}),minHeight:({theme:r})=>({...r("spacing"),full:"100%",screen:"100vh",svh:"100svh",lvh:"100lvh",dvh:"100dvh",min:"min-content",max:"max-content",fit:"fit-content"}),minWidth:({theme:r})=>({...r("spacing"),full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),objectPosition:{bottom:"bottom",center:"center",left:"left","left-bottom":"left bottom","left-top":"left top",right:"right","right-bottom":"right bottom","right-top":"right top",top:"top"},opacity:{0:"0",5:"0.05",10:"0.1",15:"0.15",20:"0.2",25:"0.25",30:"0.3",35:"0.35",40:"0.4",45:"0.45",50:"0.5",55:"0.55",60:"0.6",65:"0.65",70:"0.7",75:"0.75",80:"0.8",85:"0.85",90:"0.9",95:"0.95",100:"1"},order:{first:"-9999",last:"9999",none:"0",1:"1",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7",8:"8",9:"9",10:"10",11:"11",12:"12"},outlineColor:({theme:r})=>r("colors"),outlineOffset:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},outlineWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},padding:({theme:r})=>r("spacing"),placeholderColor:({theme:r})=>r("colors"),placeholderOpacity:({theme:r})=>r("opacity"),ringColor:({theme:r})=>({DEFAULT:r("colors.blue.500","#3b82f6"),...r("colors")}),ringOffsetColor:({theme:r})=>r("colors"),ringOffsetWidth:{0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},ringOpacity:({theme:r})=>({DEFAULT:"0.5",...r("opacity")}),ringWidth:{DEFAULT:"3px",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},rotate:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg",45:"45deg",90:"90deg",180:"180deg"},saturate:{0:"0",50:".5",100:"1",150:"1.5",200:"2"},scale:{0:"0",50:".5",75:".75",90:".9",95:".95",100:"1",105:"1.05",110:"1.1",125:"1.25",150:"1.5"},screens:{sm:"640px",md:"768px",lg:"1024px",xl:"1280px","2xl":"1536px"},scrollMargin:({theme:r})=>({...r("spacing")}),scrollPadding:({theme:r})=>r("spacing"),sepia:{0:"0",DEFAULT:"100%"},skew:{0:"0deg",1:"1deg",2:"2deg",3:"3deg",6:"6deg",12:"12deg"},space:({theme:r})=>({...r("spacing")}),spacing:{px:"1px",0:"0px",.5:"0.125rem",1:"0.25rem",1.5:"0.375rem",2:"0.5rem",2.5:"0.625rem",3:"0.75rem",3.5:"0.875rem",4:"1rem",5:"1.25rem",6:"1.5rem",7:"1.75rem",8:"2rem",9:"2.25rem",10:"2.5rem",11:"2.75rem",12:"3rem",14:"3.5rem",16:"4rem",20:"5rem",24:"6rem",28:"7rem",32:"8rem",36:"9rem",40:"10rem",44:"11rem",48:"12rem",52:"13rem",56:"14rem",60:"15rem",64:"16rem",72:"18rem",80:"20rem",96:"24rem"},stroke:({theme:r})=>({none:"none",...r("colors")}),strokeWidth:{0:"0",1:"1",2:"2"},supports:{},data:{},textColor:({theme:r})=>r("colors"),textDecorationColor:({theme:r})=>r("colors"),textDecorationThickness:{auto:"auto","from-font":"from-font",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},textIndent:({theme:r})=>({...r("spacing")}),textOpacity:({theme:r})=>r("opacity"),textUnderlineOffset:{auto:"auto",0:"0px",1:"1px",2:"2px",4:"4px",8:"8px"},transformOrigin:{center:"center",top:"top","top-right":"top right",right:"right","bottom-right":"bottom right",bottom:"bottom","bottom-left":"bottom left",left:"left","top-left":"top left"},transitionDelay:{0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionDuration:{DEFAULT:"150ms",0:"0s",75:"75ms",100:"100ms",150:"150ms",200:"200ms",300:"300ms",500:"500ms",700:"700ms",1e3:"1000ms"},transitionProperty:{none:"none",all:"all",DEFAULT:"color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter",colors:"color, background-color, border-color, text-decoration-color, fill, stroke",opacity:"opacity",shadow:"box-shadow",transform:"transform"},transitionTimingFunction:{DEFAULT:"cubic-bezier(0.4, 0, 0.2, 1)",linear:"linear",in:"cubic-bezier(0.4, 0, 1, 1)",out:"cubic-bezier(0, 0, 0.2, 1)","in-out":"cubic-bezier(0.4, 0, 0.2, 1)"},translate:({theme:r})=>({...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%",full:"100%"}),size:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",min:"min-content",max:"max-content",fit:"fit-content"}),width:({theme:r})=>({auto:"auto",...r("spacing"),"1/2":"50%","1/3":"33.333333%","2/3":"66.666667%","1/4":"25%","2/4":"50%","3/4":"75%","1/5":"20%","2/5":"40%","3/5":"60%","4/5":"80%","1/6":"16.666667%","2/6":"33.333333%","3/6":"50%","4/6":"66.666667%","5/6":"83.333333%","1/12":"8.333333%","2/12":"16.666667%","3/12":"25%","4/12":"33.333333%","5/12":"41.666667%","6/12":"50%","7/12":"58.333333%","8/12":"66.666667%","9/12":"75%","10/12":"83.333333%","11/12":"91.666667%",full:"100%",screen:"100vw",svw:"100svw",lvw:"100lvw",dvw:"100dvw",min:"min-content",max:"max-content",fit:"fit-content"}),willChange:{auto:"auto",scroll:"scroll-position",contents:"contents",transform:"transform"},zIndex:{auto:"auto",0:"0",10:"10",20:"20",30:"30",40:"40",50:"50"}},plugins:[]}});function tn(r){let e=(r?.presets??[tc.default]).slice().reverse().flatMap(n=>tn(n instanceof Function?n():n)),t={respectDefaultRingColorOpacity:{theme:{ringColor:({theme:n})=>({DEFAULT:"#3b82f67f",...n("colors")})}},disableColorOpacityUtilitiesByDefault:{corePlugins:{backgroundOpacity:!1,borderOpacity:!1,divideOpacity:!1,placeholderOpacity:!1,ringOpacity:!1,textOpacity:!1}}},i=Object.keys(t).filter(n=>we(r,n)).map(n=>t[n]);return[r,...i,...e]}var tc,rc=R(()=>{u();tc=pe(en());ct()});var ic={};Ge(ic,{default:()=>zr});function zr(...r){let[,...e]=tn(r[0]);return ra([...r,...e])}var ia=R(()=>{u();Zf();rc()});var Ur={};Ge(Ur,{default:()=>me});var me,et=R(()=>{u();me={resolve:r=>r,extname:r=>"."+r.split(".").pop()}});function rn(r){return typeof r=="object"&&r!==null}function gx(r){return Object.keys(r).length===0}function nc(r){return typeof r=="string"||r instanceof String}function na(r){return rn(r)&&r.config===void 0&&!gx(r)?null:rn(r)&&r.config!==void 0&&nc(r.config)?me.resolve(r.config):rn(r)&&r.config!==void 0&&rn(r.config)?null:nc(r)?me.resolve(r):yx()}function yx(){for(let r of mx)try{let e=me.resolve(r);return be.accessSync(e),e}catch(e){}return null}var mx,sc=R(()=>{u();ft();et();mx=["./tailwind.config.js","./tailwind.config.cjs","./tailwind.config.mjs","./tailwind.config.ts","./tailwind.config.cts","./tailwind.config.mts"]});var ac={};Ge(ac,{default:()=>sa});var sa,aa=R(()=>{u();sa={parse:r=>({href:r})}});var oa=x(()=>{u()});var nn=x((b3,uc)=>{u();"use strict";var oc=(Wi(),kf),lc=oa(),Zt=class extends Error{constructor(e,t,i,n,a,s){super(e);this.name="CssSyntaxError",this.reason=e,a&&(this.file=a),n&&(this.source=n),s&&(this.plugin=s),typeof t!="undefined"&&typeof i!="undefined"&&(typeof t=="number"?(this.line=t,this.column=i):(this.line=t.line,this.column=t.column,this.endLine=i.line,this.endColumn=i.column)),this.setMessage(),Error.captureStackTrace&&Error.captureStackTrace(this,Zt)}setMessage(){this.message=this.plugin?this.plugin+": ":"",this.message+=this.file?this.file:"",typeof this.line!="undefined"&&(this.message+=":"+this.line+":"+this.column),this.message+=": "+this.reason}showSourceCode(e){if(!this.source)return"";let t=this.source;e==null&&(e=oc.isColorSupported),lc&&e&&(t=lc(t));let i=t.split(/\r?\n/),n=Math.max(this.line-3,0),a=Math.min(this.line+2,i.length),s=String(a).length,o,l;if(e){let{bold:c,red:f,gray:d}=oc.createColors(!0);o=p=>c(f(p)),l=p=>d(p)}else o=l=c=>c;return i.slice(n,a).map((c,f)=>{let d=n+1+f,p=" "+(" "+d).slice(-s)+" | ";if(d===this.line){let h=l(p.replace(/\d/g," "))+c.slice(0,this.column-1).replace(/[^\t]/g," ");return o(">")+l(p)+c+` + `+h+o("^")}return" "+l(p)+c}).join(` +`)}toString(){let e=this.showSourceCode();return e&&(e=` + +`+e+` +`),this.name+": "+this.message+e}};uc.exports=Zt;Zt.default=Zt});var sn=x((w3,la)=>{u();"use strict";la.exports.isClean=Symbol("isClean");la.exports.my=Symbol("my")});var ua=x((v3,cc)=>{u();"use strict";var fc={colon:": ",indent:" ",beforeDecl:` +`,beforeRule:` +`,beforeOpen:" ",beforeClose:` +`,beforeComment:` +`,after:` +`,emptyBody:"",commentLeft:" ",commentRight:" ",semicolon:!1};function bx(r){return r[0].toUpperCase()+r.slice(1)}var an=class{constructor(e){this.builder=e}stringify(e,t){if(!this[e.type])throw new Error("Unknown AST node type "+e.type+". Maybe you need to change PostCSS stringifier.");this[e.type](e,t)}document(e){this.body(e)}root(e){this.body(e),e.raws.after&&this.builder(e.raws.after)}comment(e){let t=this.raw(e,"left","commentLeft"),i=this.raw(e,"right","commentRight");this.builder("/*"+t+e.text+i+"*/",e)}decl(e,t){let i=this.raw(e,"between","colon"),n=e.prop+i+this.rawValue(e,"value");e.important&&(n+=e.raws.important||" !important"),t&&(n+=";"),this.builder(n,e)}rule(e){this.block(e,this.rawValue(e,"selector")),e.raws.ownSemicolon&&this.builder(e.raws.ownSemicolon,e,"end")}atrule(e,t){let i="@"+e.name,n=e.params?this.rawValue(e,"params"):"";if(typeof e.raws.afterName!="undefined"?i+=e.raws.afterName:n&&(i+=" "),e.nodes)this.block(e,i+n);else{let a=(e.raws.between||"")+(t?";":"");this.builder(i+n+a,e)}}body(e){let t=e.nodes.length-1;for(;t>0&&e.nodes[t].type==="comment";)t-=1;let i=this.raw(e,"semicolon");for(let n=0;n{if(n=l.raws[t],typeof n!="undefined")return!1})}return typeof n=="undefined"&&(n=fc[i]),s.rawCache[i]=n,n}rawSemicolon(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length&&i.last.type==="decl"&&(t=i.raws.semicolon,typeof t!="undefined"))return!1}),t}rawEmptyBody(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length===0&&(t=i.raws.after,typeof t!="undefined"))return!1}),t}rawIndent(e){if(e.raws.indent)return e.raws.indent;let t;return e.walk(i=>{let n=i.parent;if(n&&n!==e&&n.parent&&n.parent===e&&typeof i.raws.before!="undefined"){let a=i.raws.before.split(` +`);return t=a[a.length-1],t=t.replace(/\S/g,""),!1}}),t}rawBeforeComment(e,t){let i;return e.walkComments(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(` +`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeDecl"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeDecl(e,t){let i;return e.walkDecls(n=>{if(typeof n.raws.before!="undefined")return i=n.raws.before,i.includes(` +`)&&(i=i.replace(/[^\n]+$/,"")),!1}),typeof i=="undefined"?i=this.raw(t,null,"beforeRule"):i&&(i=i.replace(/\S/g,"")),i}rawBeforeRule(e){let t;return e.walk(i=>{if(i.nodes&&(i.parent!==e||e.first!==i)&&typeof i.raws.before!="undefined")return t=i.raws.before,t.includes(` +`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeClose(e){let t;return e.walk(i=>{if(i.nodes&&i.nodes.length>0&&typeof i.raws.after!="undefined")return t=i.raws.after,t.includes(` +`)&&(t=t.replace(/[^\n]+$/,"")),!1}),t&&(t=t.replace(/\S/g,"")),t}rawBeforeOpen(e){let t;return e.walk(i=>{if(i.type!=="decl"&&(t=i.raws.between,typeof t!="undefined"))return!1}),t}rawColon(e){let t;return e.walkDecls(i=>{if(typeof i.raws.between!="undefined")return t=i.raws.between.replace(/[^\s:]/g,""),!1}),t}beforeAfter(e,t){let i;e.type==="decl"?i=this.raw(e,null,"beforeDecl"):e.type==="comment"?i=this.raw(e,null,"beforeComment"):t==="before"?i=this.raw(e,null,"beforeRule"):i=this.raw(e,null,"beforeClose");let n=e.parent,a=0;for(;n&&n.type!=="root";)a+=1,n=n.parent;if(i.includes(` +`)){let s=this.raw(e,null,"indent");if(s.length)for(let o=0;o{u();"use strict";var wx=ua();function fa(r,e){new wx(e).stringify(r)}pc.exports=fa;fa.default=fa});var Hr=x((k3,dc)=>{u();"use strict";var{isClean:on,my:vx}=sn(),xx=nn(),kx=ua(),Sx=Vr();function ca(r,e){let t=new r.constructor;for(let i in r){if(!Object.prototype.hasOwnProperty.call(r,i)||i==="proxyCache")continue;let n=r[i],a=typeof n;i==="parent"&&a==="object"?e&&(t[i]=e):i==="source"?t[i]=n:Array.isArray(n)?t[i]=n.map(s=>ca(s,t)):(a==="object"&&n!==null&&(n=ca(n)),t[i]=n)}return t}var ln=class{constructor(e={}){this.raws={},this[on]=!1,this[vx]=!0;for(let t in e)if(t==="nodes"){this.nodes=[];for(let i of e[t])typeof i.clone=="function"?this.append(i.clone()):this.append(i)}else this[t]=e[t]}error(e,t={}){if(this.source){let{start:i,end:n}=this.rangeBy(t);return this.source.input.error(e,{line:i.line,column:i.column},{line:n.line,column:n.column},t)}return new xx(e)}warn(e,t,i){let n={node:this};for(let a in i)n[a]=i[a];return e.warn(t,n)}remove(){return this.parent&&this.parent.removeChild(this),this.parent=void 0,this}toString(e=Sx){e.stringify&&(e=e.stringify);let t="";return e(this,i=>{t+=i}),t}assign(e={}){for(let t in e)this[t]=e[t];return this}clone(e={}){let t=ca(this);for(let i in e)t[i]=e[i];return t}cloneBefore(e={}){let t=this.clone(e);return this.parent.insertBefore(this,t),t}cloneAfter(e={}){let t=this.clone(e);return this.parent.insertAfter(this,t),t}replaceWith(...e){if(this.parent){let t=this,i=!1;for(let n of e)n===this?i=!0:i?(this.parent.insertAfter(t,n),t=n):this.parent.insertBefore(t,n);i||this.remove()}return this}next(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e+1]}prev(){if(!this.parent)return;let e=this.parent.index(this);return this.parent.nodes[e-1]}before(e){return this.parent.insertBefore(this,e),this}after(e){return this.parent.insertAfter(this,e),this}root(){let e=this;for(;e.parent&&e.parent.type!=="document";)e=e.parent;return e}raw(e,t){return new kx().raw(this,e,t)}cleanRaws(e){delete this.raws.before,delete this.raws.after,e||delete this.raws.between}toJSON(e,t){let i={},n=t==null;t=t||new Map;let a=0;for(let s in this){if(!Object.prototype.hasOwnProperty.call(this,s)||s==="parent"||s==="proxyCache")continue;let o=this[s];if(Array.isArray(o))i[s]=o.map(l=>typeof l=="object"&&l.toJSON?l.toJSON(null,t):l);else if(typeof o=="object"&&o.toJSON)i[s]=o.toJSON(null,t);else if(s==="source"){let l=t.get(o.input);l==null&&(l=a,t.set(o.input,a),a++),i[s]={inputId:l,start:o.start,end:o.end}}else i[s]=o}return n&&(i.inputs=[...t.keys()].map(s=>s.toJSON())),i}positionInside(e){let t=this.toString(),i=this.source.start.column,n=this.source.start.line;for(let a=0;ae.root().toProxy():e[t]}}}toProxy(){return this.proxyCache||(this.proxyCache=new Proxy(this,this.getProxyProcessor())),this.proxyCache}addToError(e){if(e.postcssNode=this,e.stack&&this.source&&/\n\s{4}at /.test(e.stack)){let t=this.source;e.stack=e.stack.replace(/\n\s{4}at /,`$&${t.input.from}:${t.start.line}:${t.start.column}$&`)}return e}markDirty(){if(this[on]){this[on]=!1;let e=this;for(;e=e.parent;)e[on]=!1}}get proxyOf(){return this}};dc.exports=ln;ln.default=ln});var Wr=x((S3,hc)=>{u();"use strict";var Ax=Hr(),un=class extends Ax{constructor(e){e&&typeof e.value!="undefined"&&typeof e.value!="string"&&(e={...e,value:String(e.value)});super(e);this.type="decl"}get variable(){return this.prop.startsWith("--")||this.prop[0]==="$"}};hc.exports=un;un.default=un});var pa=x((A3,mc)=>{u();mc.exports=function(r,e){return{generate:()=>{let t="";return r(e,i=>{t+=i}),[t]}}}});var Gr=x((C3,gc)=>{u();"use strict";var Cx=Hr(),fn=class extends Cx{constructor(e){super(e);this.type="comment"}};gc.exports=fn;fn.default=fn});var Et=x((_3,Cc)=>{u();"use strict";var{isClean:yc,my:bc}=sn(),wc=Wr(),vc=Gr(),_x=Hr(),xc,da,ha,kc;function Sc(r){return r.map(e=>(e.nodes&&(e.nodes=Sc(e.nodes)),delete e.source,e))}function Ac(r){if(r[yc]=!1,r.proxyOf.nodes)for(let e of r.proxyOf.nodes)Ac(e)}var Fe=class extends _x{push(e){return e.parent=this,this.proxyOf.nodes.push(e),this}each(e){if(!this.proxyOf.nodes)return;let t=this.getIterator(),i,n;for(;this.indexes[t]{let n;try{n=e(t,i)}catch(a){throw t.addToError(a)}return n!==!1&&t.walk&&(n=t.walk(e)),n})}walkDecls(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="decl"&&e.test(i.prop))return t(i,n)}):this.walk((i,n)=>{if(i.type==="decl"&&i.prop===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="decl")return t(i,n)}))}walkRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="rule"&&e.test(i.selector))return t(i,n)}):this.walk((i,n)=>{if(i.type==="rule"&&i.selector===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="rule")return t(i,n)}))}walkAtRules(e,t){return t?e instanceof RegExp?this.walk((i,n)=>{if(i.type==="atrule"&&e.test(i.name))return t(i,n)}):this.walk((i,n)=>{if(i.type==="atrule"&&i.name===e)return t(i,n)}):(t=e,this.walk((i,n)=>{if(i.type==="atrule")return t(i,n)}))}walkComments(e){return this.walk((t,i)=>{if(t.type==="comment")return e(t,i)})}append(...e){for(let t of e){let i=this.normalize(t,this.last);for(let n of i)this.proxyOf.nodes.push(n)}return this.markDirty(),this}prepend(...e){e=e.reverse();for(let t of e){let i=this.normalize(t,this.first,"prepend").reverse();for(let n of i)this.proxyOf.nodes.unshift(n);for(let n in this.indexes)this.indexes[n]=this.indexes[n]+i.length}return this.markDirty(),this}cleanRaws(e){if(super.cleanRaws(e),this.nodes)for(let t of this.nodes)t.cleanRaws(e)}insertBefore(e,t){let i=this.index(e),n=i===0?"prepend":!1,a=this.normalize(t,this.proxyOf.nodes[i],n).reverse();i=this.index(e);for(let o of a)this.proxyOf.nodes.splice(i,0,o);let s;for(let o in this.indexes)s=this.indexes[o],i<=s&&(this.indexes[o]=s+a.length);return this.markDirty(),this}insertAfter(e,t){let i=this.index(e),n=this.normalize(t,this.proxyOf.nodes[i]).reverse();i=this.index(e);for(let s of n)this.proxyOf.nodes.splice(i+1,0,s);let a;for(let s in this.indexes)a=this.indexes[s],i=e&&(this.indexes[i]=t-1);return this.markDirty(),this}removeAll(){for(let e of this.proxyOf.nodes)e.parent=void 0;return this.proxyOf.nodes=[],this.markDirty(),this}replaceValues(e,t,i){return i||(i=t,t={}),this.walkDecls(n=>{t.props&&!t.props.includes(n.prop)||t.fast&&!n.value.includes(t.fast)||(n.value=n.value.replace(e,i))}),this.markDirty(),this}every(e){return this.nodes.every(e)}some(e){return this.nodes.some(e)}index(e){return typeof e=="number"?e:(e.proxyOf&&(e=e.proxyOf),this.proxyOf.nodes.indexOf(e))}get first(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[0]}get last(){if(!!this.proxyOf.nodes)return this.proxyOf.nodes[this.proxyOf.nodes.length-1]}normalize(e,t){if(typeof e=="string")e=Sc(xc(e).nodes);else if(Array.isArray(e)){e=e.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type==="root"&&this.type!=="document"){e=e.nodes.slice(0);for(let n of e)n.parent&&n.parent.removeChild(n,"ignore")}else if(e.type)e=[e];else if(e.prop){if(typeof e.value=="undefined")throw new Error("Value field is missed in node creation");typeof e.value!="string"&&(e.value=String(e.value)),e=[new wc(e)]}else if(e.selector)e=[new da(e)];else if(e.name)e=[new ha(e)];else if(e.text)e=[new vc(e)];else throw new Error("Unknown node type in node creation");return e.map(n=>(n[bc]||Fe.rebuild(n),n=n.proxyOf,n.parent&&n.parent.removeChild(n),n[yc]&&Ac(n),typeof n.raws.before=="undefined"&&t&&typeof t.raws.before!="undefined"&&(n.raws.before=t.raws.before.replace(/\S/g,"")),n.parent=this.proxyOf,n))}getProxyProcessor(){return{set(e,t,i){return e[t]===i||(e[t]=i,(t==="name"||t==="params"||t==="selector")&&e.markDirty()),!0},get(e,t){return t==="proxyOf"?e:e[t]?t==="each"||typeof t=="string"&&t.startsWith("walk")?(...i)=>e[t](...i.map(n=>typeof n=="function"?(a,s)=>n(a.toProxy(),s):n)):t==="every"||t==="some"?i=>e[t]((n,...a)=>i(n.toProxy(),...a)):t==="root"?()=>e.root().toProxy():t==="nodes"?e.nodes.map(i=>i.toProxy()):t==="first"||t==="last"?e[t].toProxy():e[t]:e[t]}}}getIterator(){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach+=1;let e=this.lastEach;return this.indexes[e]=0,e}};Fe.registerParse=r=>{xc=r};Fe.registerRule=r=>{da=r};Fe.registerAtRule=r=>{ha=r};Fe.registerRoot=r=>{kc=r};Cc.exports=Fe;Fe.default=Fe;Fe.rebuild=r=>{r.type==="atrule"?Object.setPrototypeOf(r,ha.prototype):r.type==="rule"?Object.setPrototypeOf(r,da.prototype):r.type==="decl"?Object.setPrototypeOf(r,wc.prototype):r.type==="comment"?Object.setPrototypeOf(r,vc.prototype):r.type==="root"&&Object.setPrototypeOf(r,kc.prototype),r[bc]=!0,r.nodes&&r.nodes.forEach(e=>{Fe.rebuild(e)})}});var cn=x((E3,Oc)=>{u();"use strict";var Ex=Et(),_c,Ec,er=class extends Ex{constructor(e){super({type:"document",...e});this.nodes||(this.nodes=[])}toResult(e={}){return new _c(new Ec,this,e).stringify()}};er.registerLazyResult=r=>{_c=r};er.registerProcessor=r=>{Ec=r};Oc.exports=er;er.default=er});var ma=x((O3,Rc)=>{u();"use strict";var Tc={};Rc.exports=function(e){Tc[e]||(Tc[e]=!0,typeof console!="undefined"&&console.warn&&console.warn(e))}});var ga=x((T3,Pc)=>{u();"use strict";var pn=class{constructor(e,t={}){if(this.type="warning",this.text=e,t.node&&t.node.source){let i=t.node.rangeBy(t);this.line=i.start.line,this.column=i.start.column,this.endLine=i.end.line,this.endColumn=i.end.column}for(let i in t)this[i]=t[i]}toString(){return this.node?this.node.error(this.text,{plugin:this.plugin,index:this.index,word:this.word}).message:this.plugin?this.plugin+": "+this.text:this.text}};Pc.exports=pn;pn.default=pn});var hn=x((R3,Ic)=>{u();"use strict";var Ox=ga(),dn=class{constructor(e,t,i){this.processor=e,this.messages=[],this.root=t,this.opts=i,this.css=void 0,this.map=void 0}toString(){return this.css}warn(e,t={}){t.plugin||this.lastPlugin&&this.lastPlugin.postcssPlugin&&(t.plugin=this.lastPlugin.postcssPlugin);let i=new Ox(e,t);return this.messages.push(i),i}warnings(){return this.messages.filter(e=>e.type==="warning")}get content(){return this.css}};Ic.exports=dn;dn.default=dn});var Mc=x((P3,Lc)=>{u();"use strict";var ya="'".charCodeAt(0),Dc='"'.charCodeAt(0),mn="\\".charCodeAt(0),qc="/".charCodeAt(0),gn=` +`.charCodeAt(0),Qr=" ".charCodeAt(0),yn="\f".charCodeAt(0),bn=" ".charCodeAt(0),wn="\r".charCodeAt(0),Tx="[".charCodeAt(0),Rx="]".charCodeAt(0),Px="(".charCodeAt(0),Ix=")".charCodeAt(0),Dx="{".charCodeAt(0),qx="}".charCodeAt(0),$x=";".charCodeAt(0),Lx="*".charCodeAt(0),Mx=":".charCodeAt(0),Nx="@".charCodeAt(0),vn=/[\t\n\f\r "#'()/;[\\\]{}]/g,xn=/[\t\n\f\r !"#'():;@[\\\]{}]|\/(?=\*)/g,Bx=/.[\n"'(/\\]/,$c=/[\da-f]/i;Lc.exports=function(e,t={}){let i=e.css.valueOf(),n=t.ignoreErrors,a,s,o,l,c,f,d,p,h,b,v=i.length,y=0,w=[],k=[];function S(){return y}function E(T){throw e.error("Unclosed "+T,y)}function O(){return k.length===0&&y>=v}function B(T){if(k.length)return k.pop();if(y>=v)return;let F=T?T.ignoreUnclosed:!1;switch(a=i.charCodeAt(y),a){case gn:case Qr:case bn:case wn:case yn:{s=y;do s+=1,a=i.charCodeAt(s);while(a===Qr||a===gn||a===bn||a===wn||a===yn);b=["space",i.slice(y,s)],y=s-1;break}case Tx:case Rx:case Dx:case qx:case Mx:case $x:case Ix:{let Y=String.fromCharCode(a);b=[Y,Y,y];break}case Px:{if(p=w.length?w.pop()[1]:"",h=i.charCodeAt(y+1),p==="url"&&h!==ya&&h!==Dc&&h!==Qr&&h!==gn&&h!==bn&&h!==yn&&h!==wn){s=y;do{if(f=!1,s=i.indexOf(")",s+1),s===-1)if(n||F){s=y;break}else E("bracket");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["brackets",i.slice(y,s+1),y,s],y=s}else s=i.indexOf(")",y+1),l=i.slice(y,s+1),s===-1||Bx.test(l)?b=["(","(",y]:(b=["brackets",l,y,s],y=s);break}case ya:case Dc:{o=a===ya?"'":'"',s=y;do{if(f=!1,s=i.indexOf(o,s+1),s===-1)if(n||F){s=y+1;break}else E("string");for(d=s;i.charCodeAt(d-1)===mn;)d-=1,f=!f}while(f);b=["string",i.slice(y,s+1),y,s],y=s;break}case Nx:{vn.lastIndex=y+1,vn.test(i),vn.lastIndex===0?s=i.length-1:s=vn.lastIndex-2,b=["at-word",i.slice(y,s+1),y,s],y=s;break}case mn:{for(s=y,c=!0;i.charCodeAt(s+1)===mn;)s+=1,c=!c;if(a=i.charCodeAt(s+1),c&&a!==qc&&a!==Qr&&a!==gn&&a!==bn&&a!==wn&&a!==yn&&(s+=1,$c.test(i.charAt(s)))){for(;$c.test(i.charAt(s+1));)s+=1;i.charCodeAt(s+1)===Qr&&(s+=1)}b=["word",i.slice(y,s+1),y,s],y=s;break}default:{a===qc&&i.charCodeAt(y+1)===Lx?(s=i.indexOf("*/",y+2)+1,s===0&&(n||F?s=i.length:E("comment")),b=["comment",i.slice(y,s+1),y,s],y=s):(xn.lastIndex=y+1,xn.test(i),xn.lastIndex===0?s=i.length-1:s=xn.lastIndex-2,b=["word",i.slice(y,s+1),y,s],w.push(b),y=s);break}}return y++,b}function N(T){k.push(T)}return{back:N,nextToken:B,endOfFile:O,position:S}}});var kn=x((I3,Bc)=>{u();"use strict";var Nc=Et(),Yr=class extends Nc{constructor(e){super(e);this.type="atrule"}append(...e){return this.proxyOf.nodes||(this.nodes=[]),super.append(...e)}prepend(...e){return this.proxyOf.nodes||(this.nodes=[]),super.prepend(...e)}};Bc.exports=Yr;Yr.default=Yr;Nc.registerAtRule(Yr)});var tr=x((D3,Uc)=>{u();"use strict";var Fc=Et(),jc,zc,Ut=class extends Fc{constructor(e){super(e);this.type="root",this.nodes||(this.nodes=[])}removeChild(e,t){let i=this.index(e);return!t&&i===0&&this.nodes.length>1&&(this.nodes[1].raws.before=this.nodes[i].raws.before),super.removeChild(e)}normalize(e,t,i){let n=super.normalize(e);if(t){if(i==="prepend")this.nodes.length>1?t.raws.before=this.nodes[1].raws.before:delete t.raws.before;else if(this.first!==t)for(let a of n)a.raws.before=t.raws.before}return n}toResult(e={}){return new jc(new zc,this,e).stringify()}};Ut.registerLazyResult=r=>{jc=r};Ut.registerProcessor=r=>{zc=r};Uc.exports=Ut;Ut.default=Ut;Fc.registerRoot(Ut)});var ba=x((q3,Vc)=>{u();"use strict";var Kr={split(r,e,t){let i=[],n="",a=!1,s=0,o=!1,l="",c=!1;for(let f of r)c?c=!1:f==="\\"?c=!0:o?f===l&&(o=!1):f==='"'||f==="'"?(o=!0,l=f):f==="("?s+=1:f===")"?s>0&&(s-=1):s===0&&e.includes(f)&&(a=!0),a?(n!==""&&i.push(n.trim()),n="",a=!1):n+=f;return(t||n!=="")&&i.push(n.trim()),i},space(r){let e=[" ",` +`," "];return Kr.split(r,e)},comma(r){return Kr.split(r,[","],!0)}};Vc.exports=Kr;Kr.default=Kr});var Sn=x(($3,Wc)=>{u();"use strict";var Hc=Et(),Fx=ba(),Xr=class extends Hc{constructor(e){super(e);this.type="rule",this.nodes||(this.nodes=[])}get selectors(){return Fx.comma(this.selector)}set selectors(e){let t=this.selector?this.selector.match(/,\s*/):null,i=t?t[0]:","+this.raw("between","beforeOpen");this.selector=e.join(i)}};Wc.exports=Xr;Xr.default=Xr;Hc.registerRule(Xr)});var Xc=x((L3,Kc)=>{u();"use strict";var jx=Wr(),zx=Mc(),Ux=Gr(),Vx=kn(),Hx=tr(),Gc=Sn(),Qc={empty:!0,space:!0};function Wx(r){for(let e=r.length-1;e>=0;e--){let t=r[e],i=t[3]||t[2];if(i)return i}}var Yc=class{constructor(e){this.input=e,this.root=new Hx,this.current=this.root,this.spaces="",this.semicolon=!1,this.customProperty=!1,this.createTokenizer(),this.root.source={input:e,start:{offset:0,line:1,column:1}}}createTokenizer(){this.tokenizer=zx(this.input)}parse(){let e;for(;!this.tokenizer.endOfFile();)switch(e=this.tokenizer.nextToken(),e[0]){case"space":this.spaces+=e[1];break;case";":this.freeSemicolon(e);break;case"}":this.end(e);break;case"comment":this.comment(e);break;case"at-word":this.atrule(e);break;case"{":this.emptyRule(e);break;default:this.other(e);break}this.endFile()}comment(e){let t=new Ux;this.init(t,e[2]),t.source.end=this.getPosition(e[3]||e[2]);let i=e[1].slice(2,-2);if(/^\s*$/.test(i))t.text="",t.raws.left=i,t.raws.right="";else{let n=i.match(/^(\s*)([^]*\S)(\s*)$/);t.text=n[2],t.raws.left=n[1],t.raws.right=n[3]}}emptyRule(e){let t=new Gc;this.init(t,e[2]),t.selector="",t.raws.between="",this.current=t}other(e){let t=!1,i=null,n=!1,a=null,s=[],o=e[1].startsWith("--"),l=[],c=e;for(;c;){if(i=c[0],l.push(c),i==="("||i==="[")a||(a=c),s.push(i==="("?")":"]");else if(o&&n&&i==="{")a||(a=c),s.push("}");else if(s.length===0)if(i===";")if(n){this.decl(l,o);return}else break;else if(i==="{"){this.rule(l);return}else if(i==="}"){this.tokenizer.back(l.pop()),t=!0;break}else i===":"&&(n=!0);else i===s[s.length-1]&&(s.pop(),s.length===0&&(a=null));c=this.tokenizer.nextToken()}if(this.tokenizer.endOfFile()&&(t=!0),s.length>0&&this.unclosedBracket(a),t&&n){if(!o)for(;l.length&&(c=l[l.length-1][0],!(c!=="space"&&c!=="comment"));)this.tokenizer.back(l.pop());this.decl(l,o)}else this.unknownWord(l)}rule(e){e.pop();let t=new Gc;this.init(t,e[0][2]),t.raws.between=this.spacesAndCommentsFromEnd(e),this.raw(t,"selector",e),this.current=t}decl(e,t){let i=new jx;this.init(i,e[0][2]);let n=e[e.length-1];for(n[0]===";"&&(this.semicolon=!0,e.pop()),i.source.end=this.getPosition(n[3]||n[2]||Wx(e));e[0][0]!=="word";)e.length===1&&this.unknownWord(e),i.raws.before+=e.shift()[1];for(i.source.start=this.getPosition(e[0][2]),i.prop="";e.length;){let c=e[0][0];if(c===":"||c==="space"||c==="comment")break;i.prop+=e.shift()[1]}i.raws.between="";let a;for(;e.length;)if(a=e.shift(),a[0]===":"){i.raws.between+=a[1];break}else a[0]==="word"&&/\w/.test(a[1])&&this.unknownWord([a]),i.raws.between+=a[1];(i.prop[0]==="_"||i.prop[0]==="*")&&(i.raws.before+=i.prop[0],i.prop=i.prop.slice(1));let s=[],o;for(;e.length&&(o=e[0][0],!(o!=="space"&&o!=="comment"));)s.push(e.shift());this.precheckMissedSemicolon(e);for(let c=e.length-1;c>=0;c--){if(a=e[c],a[1].toLowerCase()==="!important"){i.important=!0;let f=this.stringFrom(e,c);f=this.spacesFromEnd(e)+f,f!==" !important"&&(i.raws.important=f);break}else if(a[1].toLowerCase()==="important"){let f=e.slice(0),d="";for(let p=c;p>0;p--){let h=f[p][0];if(d.trim().indexOf("!")===0&&h!=="space")break;d=f.pop()[1]+d}d.trim().indexOf("!")===0&&(i.important=!0,i.raws.important=d,e=f)}if(a[0]!=="space"&&a[0]!=="comment")break}e.some(c=>c[0]!=="space"&&c[0]!=="comment")&&(i.raws.between+=s.map(c=>c[1]).join(""),s=[]),this.raw(i,"value",s.concat(e),t),i.value.includes(":")&&!t&&this.checkMissedSemicolon(e)}atrule(e){let t=new Vx;t.name=e[1].slice(1),t.name===""&&this.unnamedAtrule(t,e),this.init(t,e[2]);let i,n,a,s=!1,o=!1,l=[],c=[];for(;!this.tokenizer.endOfFile();){if(e=this.tokenizer.nextToken(),i=e[0],i==="("||i==="["?c.push(i==="("?")":"]"):i==="{"&&c.length>0?c.push("}"):i===c[c.length-1]&&c.pop(),c.length===0)if(i===";"){t.source.end=this.getPosition(e[2]),this.semicolon=!0;break}else if(i==="{"){o=!0;break}else if(i==="}"){if(l.length>0){for(a=l.length-1,n=l[a];n&&n[0]==="space";)n=l[--a];n&&(t.source.end=this.getPosition(n[3]||n[2]))}this.end(e);break}else l.push(e);else l.push(e);if(this.tokenizer.endOfFile()){s=!0;break}}t.raws.between=this.spacesAndCommentsFromEnd(l),l.length?(t.raws.afterName=this.spacesAndCommentsFromStart(l),this.raw(t,"params",l),s&&(e=l[l.length-1],t.source.end=this.getPosition(e[3]||e[2]),this.spaces=t.raws.between,t.raws.between="")):(t.raws.afterName="",t.params=""),o&&(t.nodes=[],this.current=t)}end(e){this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.semicolon=!1,this.current.raws.after=(this.current.raws.after||"")+this.spaces,this.spaces="",this.current.parent?(this.current.source.end=this.getPosition(e[2]),this.current=this.current.parent):this.unexpectedClose(e)}endFile(){this.current.parent&&this.unclosedBlock(),this.current.nodes&&this.current.nodes.length&&(this.current.raws.semicolon=this.semicolon),this.current.raws.after=(this.current.raws.after||"")+this.spaces}freeSemicolon(e){if(this.spaces+=e[1],this.current.nodes){let t=this.current.nodes[this.current.nodes.length-1];t&&t.type==="rule"&&!t.raws.ownSemicolon&&(t.raws.ownSemicolon=this.spaces,this.spaces="")}}getPosition(e){let t=this.input.fromOffset(e);return{offset:e,line:t.line,column:t.col}}init(e,t){this.current.push(e),e.source={start:this.getPosition(t),input:this.input},e.raws.before=this.spaces,this.spaces="",e.type!=="comment"&&(this.semicolon=!1)}raw(e,t,i,n){let a,s,o=i.length,l="",c=!0,f,d;for(let p=0;ph+b[1],"");e.raws[t]={value:l,raw:p}}e[t]=l}spacesAndCommentsFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],!(t!=="space"&&t!=="comment"));)i=e.pop()[1]+i;return i}spacesAndCommentsFromStart(e){let t,i="";for(;e.length&&(t=e[0][0],!(t!=="space"&&t!=="comment"));)i+=e.shift()[1];return i}spacesFromEnd(e){let t,i="";for(;e.length&&(t=e[e.length-1][0],t==="space");)i=e.pop()[1]+i;return i}stringFrom(e,t){let i="";for(let n=t;n=0&&(n=e[a],!(n[0]!=="space"&&(i+=1,i===2)));a--);throw this.input.error("Missed semicolon",n[0]==="word"?n[3]+1:n[2])}};Kc.exports=Yc});var Jc=x(()=>{u()});var ep=x((B3,Zc)=>{u();var Gx="useandom-26T198340PX75pxJACKVERYMINDBUSHWOLF_GQZbfghjklqvwyzrict",Qx=(r,e=21)=>(t=e)=>{let i="",n=t;for(;n--;)i+=r[Math.random()*r.length|0];return i},Yx=(r=21)=>{let e="",t=r;for(;t--;)e+=Gx[Math.random()*64|0];return e};Zc.exports={nanoid:Yx,customAlphabet:Qx}});var wa=x((F3,tp)=>{u();tp.exports={}});var Cn=x((j3,sp)=>{u();"use strict";var{SourceMapConsumer:Kx,SourceMapGenerator:Xx}=Jc(),{fileURLToPath:rp,pathToFileURL:An}=(aa(),ac),{resolve:va,isAbsolute:xa}=(et(),Ur),{nanoid:Jx}=ep(),ka=oa(),ip=nn(),Zx=wa(),Sa=Symbol("fromOffsetCache"),e1=Boolean(Kx&&Xx),np=Boolean(va&&xa),Jr=class{constructor(e,t={}){if(e===null||typeof e=="undefined"||typeof e=="object"&&!e.toString)throw new Error(`PostCSS received ${e} instead of CSS string`);if(this.css=e.toString(),this.css[0]==="\uFEFF"||this.css[0]==="\uFFFE"?(this.hasBOM=!0,this.css=this.css.slice(1)):this.hasBOM=!1,t.from&&(!np||/^\w+:\/\//.test(t.from)||xa(t.from)?this.file=t.from:this.file=va(t.from)),np&&e1){let i=new Zx(this.css,t);if(i.text){this.map=i;let n=i.consumer().file;!this.file&&n&&(this.file=this.mapResolve(n))}}this.file||(this.id=""),this.map&&(this.map.file=this.from)}fromOffset(e){let t,i;if(this[Sa])i=this[Sa];else{let a=this.css.split(` +`);i=new Array(a.length);let s=0;for(let o=0,l=a.length;o=t)n=i.length-1;else{let a=i.length-2,s;for(;n>1),e=i[s+1])n=s+1;else{n=s;break}}return{line:n+1,col:e-i[n]+1}}error(e,t,i,n={}){let a,s,o;if(t&&typeof t=="object"){let c=t,f=i;if(typeof c.offset=="number"){let d=this.fromOffset(c.offset);t=d.line,i=d.col}else t=c.line,i=c.column;if(typeof f.offset=="number"){let d=this.fromOffset(f.offset);s=d.line,o=d.col}else s=f.line,o=f.column}else if(!i){let c=this.fromOffset(t);t=c.line,i=c.col}let l=this.origin(t,i,s,o);return l?a=new ip(e,l.endLine===void 0?l.line:{line:l.line,column:l.column},l.endLine===void 0?l.column:{line:l.endLine,column:l.endColumn},l.source,l.file,n.plugin):a=new ip(e,s===void 0?t:{line:t,column:i},s===void 0?i:{line:s,column:o},this.css,this.file,n.plugin),a.input={line:t,column:i,endLine:s,endColumn:o,source:this.css},this.file&&(An&&(a.input.url=An(this.file).toString()),a.input.file=this.file),a}origin(e,t,i,n){if(!this.map)return!1;let a=this.map.consumer(),s=a.originalPositionFor({line:e,column:t});if(!s.source)return!1;let o;typeof i=="number"&&(o=a.originalPositionFor({line:i,column:n}));let l;xa(s.source)?l=An(s.source):l=new URL(s.source,this.map.consumer().sourceRoot||An(this.map.mapFile));let c={url:l.toString(),line:s.line,column:s.column,endLine:o&&o.line,endColumn:o&&o.column};if(l.protocol==="file:")if(rp)c.file=rp(l);else throw new Error("file: protocol is not available in this PostCSS build");let f=a.sourceContentFor(s.source);return f&&(c.source=f),c}mapResolve(e){return/^\w+:\/\//.test(e)?e:va(this.map.consumer().sourceRoot||this.map.root||".",e)}get from(){return this.file||this.id}toJSON(){let e={};for(let t of["hasBOM","css","file","id"])this[t]!=null&&(e[t]=this[t]);return this.map&&(e.map={...this.map},e.map.consumerCache&&(e.map.consumerCache=void 0)),e}};sp.exports=Jr;Jr.default=Jr;ka&&ka.registerInput&&ka.registerInput(Jr)});var En=x((z3,ap)=>{u();"use strict";var t1=Et(),r1=Xc(),i1=Cn();function _n(r,e){let t=new i1(r,e),i=new r1(t);try{i.parse()}catch(n){throw n}return i.root}ap.exports=_n;_n.default=_n;t1.registerParse(_n)});var _a=x((V3,fp)=>{u();"use strict";var{isClean:tt,my:n1}=sn(),s1=pa(),a1=Vr(),o1=Et(),l1=cn(),U3=ma(),op=hn(),u1=En(),f1=tr(),c1={document:"Document",root:"Root",atrule:"AtRule",rule:"Rule",decl:"Declaration",comment:"Comment"},p1={postcssPlugin:!0,prepare:!0,Once:!0,Document:!0,Root:!0,Declaration:!0,Rule:!0,AtRule:!0,Comment:!0,DeclarationExit:!0,RuleExit:!0,AtRuleExit:!0,CommentExit:!0,RootExit:!0,DocumentExit:!0,OnceExit:!0},d1={postcssPlugin:!0,prepare:!0,Once:!0},rr=0;function Zr(r){return typeof r=="object"&&typeof r.then=="function"}function lp(r){let e=!1,t=c1[r.type];return r.type==="decl"?e=r.prop.toLowerCase():r.type==="atrule"&&(e=r.name.toLowerCase()),e&&r.append?[t,t+"-"+e,rr,t+"Exit",t+"Exit-"+e]:e?[t,t+"-"+e,t+"Exit",t+"Exit-"+e]:r.append?[t,rr,t+"Exit"]:[t,t+"Exit"]}function up(r){let e;return r.type==="document"?e=["Document",rr,"DocumentExit"]:r.type==="root"?e=["Root",rr,"RootExit"]:e=lp(r),{node:r,events:e,eventIndex:0,visitors:[],visitorIndex:0,iterator:0}}function Aa(r){return r[tt]=!1,r.nodes&&r.nodes.forEach(e=>Aa(e)),r}var Ca={},pt=class{constructor(e,t,i){this.stringified=!1,this.processed=!1;let n;if(typeof t=="object"&&t!==null&&(t.type==="root"||t.type==="document"))n=Aa(t);else if(t instanceof pt||t instanceof op)n=Aa(t.root),t.map&&(typeof i.map=="undefined"&&(i.map={}),i.map.inline||(i.map.inline=!1),i.map.prev=t.map);else{let a=u1;i.syntax&&(a=i.syntax.parse),i.parser&&(a=i.parser),a.parse&&(a=a.parse);try{n=a(t,i)}catch(s){this.processed=!0,this.error=s}n&&!n[n1]&&o1.rebuild(n)}this.result=new op(e,n,i),this.helpers={...Ca,result:this.result,postcss:Ca},this.plugins=this.processor.plugins.map(a=>typeof a=="object"&&a.prepare?{...a,...a.prepare(this.result)}:a)}get[Symbol.toStringTag](){return"LazyResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.stringify().css}get content(){return this.stringify().content}get map(){return this.stringify().map}get root(){return this.sync().root}get messages(){return this.sync().messages}warnings(){return this.sync().warnings()}toString(){return this.css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):this.processed?Promise.resolve(this.result):(this.processing||(this.processing=this.runAsync()),this.processing)}sync(){if(this.error)throw this.error;if(this.processed)return this.result;if(this.processed=!0,this.processing)throw this.getAsyncError();for(let e of this.plugins){let t=this.runOnRoot(e);if(Zr(t))throw this.getAsyncError()}if(this.prepareVisitors(),this.hasListener){let e=this.result.root;for(;!e[tt];)e[tt]=!0,this.walkSync(e);if(this.listeners.OnceExit)if(e.type==="document")for(let t of e.nodes)this.visitSync(this.listeners.OnceExit,t);else this.visitSync(this.listeners.OnceExit,e)}return this.result}stringify(){if(this.error)throw this.error;if(this.stringified)return this.result;this.stringified=!0,this.sync();let e=this.result.opts,t=a1;e.syntax&&(t=e.syntax.stringify),e.stringifier&&(t=e.stringifier),t.stringify&&(t=t.stringify);let n=new s1(t,this.result.root,this.result.opts).generate();return this.result.css=n[0],this.result.map=n[1],this.result}walkSync(e){e[tt]=!0;let t=lp(e);for(let i of t)if(i===rr)e.nodes&&e.each(n=>{n[tt]||this.walkSync(n)});else{let n=this.listeners[i];if(n&&this.visitSync(n,e.toProxy()))return}}visitSync(e,t){for(let[i,n]of e){this.result.lastPlugin=i;let a;try{a=n(t,this.helpers)}catch(s){throw this.handleError(s,t.proxyOf)}if(t.type!=="root"&&t.type!=="document"&&!t.parent)return!0;if(Zr(a))throw this.getAsyncError()}}runOnRoot(e){this.result.lastPlugin=e;try{if(typeof e=="object"&&e.Once){if(this.result.root.type==="document"){let t=this.result.root.nodes.map(i=>e.Once(i,this.helpers));return Zr(t[0])?Promise.all(t):t}return e.Once(this.result.root,this.helpers)}else if(typeof e=="function")return e(this.result.root,this.result)}catch(t){throw this.handleError(t)}}getAsyncError(){throw new Error("Use process(css).then(cb) to work with async plugins")}handleError(e,t){let i=this.result.lastPlugin;try{t&&t.addToError(e),this.error=e,e.name==="CssSyntaxError"&&!e.plugin?(e.plugin=i.postcssPlugin,e.setMessage()):i.postcssVersion}catch(n){console&&console.error&&console.error(n)}return e}async runAsync(){this.plugin=0;for(let e=0;e0;){let i=this.visitTick(t);if(Zr(i))try{await i}catch(n){let a=t[t.length-1].node;throw this.handleError(n,a)}}}if(this.listeners.OnceExit)for(let[t,i]of this.listeners.OnceExit){this.result.lastPlugin=t;try{if(e.type==="document"){let n=e.nodes.map(a=>i(a,this.helpers));await Promise.all(n)}else await i(e,this.helpers)}catch(n){throw this.handleError(n)}}}return this.processed=!0,this.stringify()}prepareVisitors(){this.listeners={};let e=(t,i,n)=>{this.listeners[i]||(this.listeners[i]=[]),this.listeners[i].push([t,n])};for(let t of this.plugins)if(typeof t=="object")for(let i in t){if(!p1[i]&&/^[A-Z]/.test(i))throw new Error(`Unknown event ${i} in ${t.postcssPlugin}. Try to update PostCSS (${this.processor.version} now).`);if(!d1[i])if(typeof t[i]=="object")for(let n in t[i])n==="*"?e(t,i,t[i][n]):e(t,i+"-"+n.toLowerCase(),t[i][n]);else typeof t[i]=="function"&&e(t,i,t[i])}this.hasListener=Object.keys(this.listeners).length>0}visitTick(e){let t=e[e.length-1],{node:i,visitors:n}=t;if(i.type!=="root"&&i.type!=="document"&&!i.parent){e.pop();return}if(n.length>0&&t.visitorIndex{Ca=r};fp.exports=pt;pt.default=pt;f1.registerLazyResult(pt);l1.registerLazyResult(pt)});var pp=x((W3,cp)=>{u();"use strict";var h1=pa(),m1=Vr(),H3=ma(),g1=En(),y1=hn(),On=class{constructor(e,t,i){t=t.toString(),this.stringified=!1,this._processor=e,this._css=t,this._opts=i,this._map=void 0;let n,a=m1;this.result=new y1(this._processor,n,this._opts),this.result.css=t;let s=this;Object.defineProperty(this.result,"root",{get(){return s.root}});let o=new h1(a,n,this._opts,t);if(o.isMap()){let[l,c]=o.generate();l&&(this.result.css=l),c&&(this.result.map=c)}}get[Symbol.toStringTag](){return"NoWorkResult"}get processor(){return this.result.processor}get opts(){return this.result.opts}get css(){return this.result.css}get content(){return this.result.css}get map(){return this.result.map}get root(){if(this._root)return this._root;let e,t=g1;try{e=t(this._css,this._opts)}catch(i){this.error=i}if(this.error)throw this.error;return this._root=e,e}get messages(){return[]}warnings(){return[]}toString(){return this._css}then(e,t){return this.async().then(e,t)}catch(e){return this.async().catch(e)}finally(e){return this.async().then(e,e)}async(){return this.error?Promise.reject(this.error):Promise.resolve(this.result)}sync(){if(this.error)throw this.error;return this.result}};cp.exports=On;On.default=On});var hp=x((G3,dp)=>{u();"use strict";var b1=pp(),w1=_a(),v1=cn(),x1=tr(),ir=class{constructor(e=[]){this.version="8.4.24",this.plugins=this.normalize(e)}use(e){return this.plugins=this.plugins.concat(this.normalize([e])),this}process(e,t={}){return this.plugins.length===0&&typeof t.parser=="undefined"&&typeof t.stringifier=="undefined"&&typeof t.syntax=="undefined"?new b1(this,e,t):new w1(this,e,t)}normalize(e){let t=[];for(let i of e)if(i.postcss===!0?i=i():i.postcss&&(i=i.postcss),typeof i=="object"&&Array.isArray(i.plugins))t=t.concat(i.plugins);else if(typeof i=="object"&&i.postcssPlugin)t.push(i);else if(typeof i=="function")t.push(i);else if(!(typeof i=="object"&&(i.parse||i.stringify)))throw new Error(i+" is not a PostCSS plugin");return t}};dp.exports=ir;ir.default=ir;x1.registerProcessor(ir);v1.registerProcessor(ir)});var gp=x((Q3,mp)=>{u();"use strict";var k1=Wr(),S1=wa(),A1=Gr(),C1=kn(),_1=Cn(),E1=tr(),O1=Sn();function ei(r,e){if(Array.isArray(r))return r.map(n=>ei(n));let{inputs:t,...i}=r;if(t){e=[];for(let n of t){let a={...n,__proto__:_1.prototype};a.map&&(a.map={...a.map,__proto__:S1.prototype}),e.push(a)}}if(i.nodes&&(i.nodes=r.nodes.map(n=>ei(n,e))),i.source){let{inputId:n,...a}=i.source;i.source=a,n!=null&&(i.source.input=e[n])}if(i.type==="root")return new E1(i);if(i.type==="decl")return new k1(i);if(i.type==="rule")return new O1(i);if(i.type==="comment")return new A1(i);if(i.type==="atrule")return new C1(i);throw new Error("Unknown node type: "+r.type)}mp.exports=ei;ei.default=ei});var $e=x((Y3,Sp)=>{u();"use strict";var T1=nn(),yp=Wr(),R1=_a(),P1=Et(),Ea=hp(),I1=Vr(),D1=gp(),bp=cn(),q1=ga(),wp=Gr(),vp=kn(),$1=hn(),L1=Cn(),M1=En(),N1=ba(),xp=Sn(),kp=tr(),B1=Hr();function Z(...r){return r.length===1&&Array.isArray(r[0])&&(r=r[0]),new Ea(r)}Z.plugin=function(e,t){let i=!1;function n(...s){console&&console.warn&&!i&&(i=!0,console.warn(e+`: postcss.plugin was deprecated. Migration guide: +https://evilmartians.com/chronicles/postcss-8-plugin-migration`),m.env.LANG&&m.env.LANG.startsWith("cn")&&console.warn(e+`: \u91CC\u9762 postcss.plugin \u88AB\u5F03\u7528. \u8FC1\u79FB\u6307\u5357: +https://www.w3ctech.com/topic/2226`));let o=t(...s);return o.postcssPlugin=e,o.postcssVersion=new Ea().version,o}let a;return Object.defineProperty(n,"postcss",{get(){return a||(a=n()),a}}),n.process=function(s,o,l){return Z([n(l)]).process(s,o)},n};Z.stringify=I1;Z.parse=M1;Z.fromJSON=D1;Z.list=N1;Z.comment=r=>new wp(r);Z.atRule=r=>new vp(r);Z.decl=r=>new yp(r);Z.rule=r=>new xp(r);Z.root=r=>new kp(r);Z.document=r=>new bp(r);Z.CssSyntaxError=T1;Z.Declaration=yp;Z.Container=P1;Z.Processor=Ea;Z.Document=bp;Z.Comment=wp;Z.Warning=q1;Z.AtRule=vp;Z.Result=$1;Z.Input=L1;Z.Rule=xp;Z.Root=kp;Z.Node=B1;R1.registerPostcss(Z);Sp.exports=Z;Z.default=Z});var re,ee,K3,X3,J3,Z3,eI,tI,rI,iI,nI,sI,aI,oI,lI,uI,fI,cI,pI,dI,hI,mI,gI,yI,bI,wI,Ot=R(()=>{u();re=pe($e()),ee=re.default,K3=re.default.stringify,X3=re.default.fromJSON,J3=re.default.plugin,Z3=re.default.parse,eI=re.default.list,tI=re.default.document,rI=re.default.comment,iI=re.default.atRule,nI=re.default.rule,sI=re.default.decl,aI=re.default.root,oI=re.default.CssSyntaxError,lI=re.default.Declaration,uI=re.default.Container,fI=re.default.Processor,cI=re.default.Document,pI=re.default.Comment,dI=re.default.Warning,hI=re.default.AtRule,mI=re.default.Result,gI=re.default.Input,yI=re.default.Rule,bI=re.default.Root,wI=re.default.Node});var Oa=x((xI,Ap)=>{u();Ap.exports=function(r,e,t,i,n){for(e=e.split?e.split("."):e,i=0;i{u();"use strict";Tn.__esModule=!0;Tn.default=z1;function F1(r){for(var e=r.toLowerCase(),t="",i=!1,n=0;n<6&&e[n]!==void 0;n++){var a=e.charCodeAt(n),s=a>=97&&a<=102||a>=48&&a<=57;if(i=a===32,!s)break;t+=e[n]}if(t.length!==0){var o=parseInt(t,16),l=o>=55296&&o<=57343;return l||o===0||o>1114111?["\uFFFD",t.length+(i?1:0)]:[String.fromCodePoint(o),t.length+(i?1:0)]}}var j1=/\\/;function z1(r){var e=j1.test(r);if(!e)return r;for(var t="",i=0;i{u();"use strict";Pn.__esModule=!0;Pn.default=U1;function U1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i0;){var n=t.shift();if(!r[n])return;r=r[n]}return r}_p.exports=Pn.default});var Tp=x((In,Op)=>{u();"use strict";In.__esModule=!0;In.default=V1;function V1(r){for(var e=arguments.length,t=new Array(e>1?e-1:0),i=1;i0;){var n=t.shift();r[n]||(r[n]={}),r=r[n]}}Op.exports=In.default});var Pp=x((Dn,Rp)=>{u();"use strict";Dn.__esModule=!0;Dn.default=H1;function H1(r){for(var e="",t=r.indexOf("/*"),i=0;t>=0;){e=e+r.slice(i,t);var n=r.indexOf("*/",t+2);if(n<0)return e;i=n+2,t=r.indexOf("/*",i)}return e=e+r.slice(i),e}Rp.exports=Dn.default});var ti=x(rt=>{u();"use strict";rt.__esModule=!0;rt.unesc=rt.stripComments=rt.getProp=rt.ensureObject=void 0;var W1=qn(Rn());rt.unesc=W1.default;var G1=qn(Ep());rt.getProp=G1.default;var Q1=qn(Tp());rt.ensureObject=Q1.default;var Y1=qn(Pp());rt.stripComments=Y1.default;function qn(r){return r&&r.__esModule?r:{default:r}}});var dt=x((ri,qp)=>{u();"use strict";ri.__esModule=!0;ri.default=void 0;var Ip=ti();function Dp(r,e){for(var t=0;ti||this.source.end.linen||this.source.end.line===i&&this.source.end.column{u();"use strict";ie.__esModule=!0;ie.UNIVERSAL=ie.TAG=ie.STRING=ie.SELECTOR=ie.ROOT=ie.PSEUDO=ie.NESTING=ie.ID=ie.COMMENT=ie.COMBINATOR=ie.CLASS=ie.ATTRIBUTE=void 0;var Z1="tag";ie.TAG=Z1;var ek="string";ie.STRING=ek;var tk="selector";ie.SELECTOR=tk;var rk="root";ie.ROOT=rk;var ik="pseudo";ie.PSEUDO=ik;var nk="nesting";ie.NESTING=nk;var sk="id";ie.ID=sk;var ak="comment";ie.COMMENT=ak;var ok="combinator";ie.COMBINATOR=ok;var lk="class";ie.CLASS=lk;var uk="attribute";ie.ATTRIBUTE=uk;var fk="universal";ie.UNIVERSAL=fk});var $n=x((ii,Np)=>{u();"use strict";ii.__esModule=!0;ii.default=void 0;var ck=dk(dt()),ht=pk(Se());function $p(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return($p=function(n){return n?t:e})(r)}function pk(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=$p(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function dk(r){return r&&r.__esModule?r:{default:r}}function hk(r,e){var t=typeof Symbol!="undefined"&&r[Symbol.iterator]||r["@@iterator"];if(t)return(t=t.call(r)).next.bind(t);if(Array.isArray(r)||(t=mk(r))||e&&r&&typeof r.length=="number"){t&&(r=t);var i=0;return function(){return i>=r.length?{done:!0}:{done:!1,value:r[i++]}}}throw new TypeError(`Invalid attempt to iterate non-iterable instance. +In order to be iterable, non-array objects must have a [Symbol.iterator]() method.`)}function mk(r,e){if(!!r){if(typeof r=="string")return Lp(r,e);var t=Object.prototype.toString.call(r).slice(8,-1);if(t==="Object"&&r.constructor&&(t=r.constructor.name),t==="Map"||t==="Set")return Array.from(r);if(t==="Arguments"||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t))return Lp(r,e)}}function Lp(r,e){(e==null||e>r.length)&&(e=r.length);for(var t=0,i=new Array(e);t=n&&(this.indexes[s]=a-1);return this},t.removeAll=function(){for(var n=hk(this.nodes),a;!(a=n()).done;){var s=a.value;s.parent=void 0}return this.nodes=[],this},t.empty=function(){return this.removeAll()},t.insertAfter=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s+1,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],s<=o&&(this.indexes[l]=o+1);return this},t.insertBefore=function(n,a){a.parent=this;var s=this.index(n);this.nodes.splice(s,0,a),a.parent=this;var o;for(var l in this.indexes)o=this.indexes[l],o<=s&&(this.indexes[l]=o+1);return this},t._findChildAtPosition=function(n,a){var s=void 0;return this.each(function(o){if(o.atPosition){var l=o.atPosition(n,a);if(l)return s=l,!1}else if(o.isAtPosition(n,a))return s=o,!1}),s},t.atPosition=function(n,a){if(this.isAtPosition(n,a))return this._findChildAtPosition(n,a)||this},t._inferEndPosition=function(){this.last&&this.last.source&&this.last.source.end&&(this.source=this.source||{},this.source.end=this.source.end||{},Object.assign(this.source.end,this.last.source.end))},t.each=function(n){this.lastEach||(this.lastEach=0),this.indexes||(this.indexes={}),this.lastEach++;var a=this.lastEach;if(this.indexes[a]=0,!!this.length){for(var s,o;this.indexes[a]{u();"use strict";ni.__esModule=!0;ni.default=void 0;var wk=xk($n()),vk=Se();function xk(r){return r&&r.__esModule?r:{default:r}}function Bp(r,e){for(var t=0;t{u();"use strict";si.__esModule=!0;si.default=void 0;var Ck=Ek($n()),_k=Se();function Ek(r){return r&&r.__esModule?r:{default:r}}function Ok(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ia(r,e)}function Ia(r,e){return Ia=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ia(r,e)}var Tk=function(r){Ok(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=_k.SELECTOR,i}return e}(Ck.default);si.default=Tk;jp.exports=si.default});var Ln=x((AI,zp)=>{u();"use strict";var Rk={},Pk=Rk.hasOwnProperty,Ik=function(e,t){if(!e)return t;var i={};for(var n in t)i[n]=Pk.call(e,n)?e[n]:t[n];return i},Dk=/[ -,\.\/:-@\[-\^`\{-~]/,qk=/[ -,\.\/:-@\[\]\^`\{-~]/,$k=/(^|\\+)?(\\[A-F0-9]{1,6})\x20(?![a-fA-F0-9\x20])/g,qa=function r(e,t){t=Ik(t,r.options),t.quotes!="single"&&t.quotes!="double"&&(t.quotes="single");for(var i=t.quotes=="double"?'"':"'",n=t.isIdentifier,a=e.charAt(0),s="",o=0,l=e.length;o126){if(f>=55296&&f<=56319&&o{u();"use strict";ai.__esModule=!0;ai.default=void 0;var Lk=Up(Ln()),Mk=ti(),Nk=Up(dt()),Bk=Se();function Up(r){return r&&r.__esModule?r:{default:r}}function Vp(r,e){for(var t=0;t{u();"use strict";oi.__esModule=!0;oi.default=void 0;var Uk=Hk(dt()),Vk=Se();function Hk(r){return r&&r.__esModule?r:{default:r}}function Wk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ma(r,e)}function Ma(r,e){return Ma=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ma(r,e)}var Gk=function(r){Wk(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=Vk.COMMENT,i}return e}(Uk.default);oi.default=Gk;Wp.exports=oi.default});var Fa=x((li,Gp)=>{u();"use strict";li.__esModule=!0;li.default=void 0;var Qk=Kk(dt()),Yk=Se();function Kk(r){return r&&r.__esModule?r:{default:r}}function Xk(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Ba(r,e)}function Ba(r,e){return Ba=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Ba(r,e)}var Jk=function(r){Xk(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=Yk.ID,n}var t=e.prototype;return t.valueToString=function(){return"#"+r.prototype.valueToString.call(this)},e}(Qk.default);li.default=Jk;Gp.exports=li.default});var Mn=x((ui,Kp)=>{u();"use strict";ui.__esModule=!0;ui.default=void 0;var Zk=Qp(Ln()),eS=ti(),tS=Qp(dt());function Qp(r){return r&&r.__esModule?r:{default:r}}function Yp(r,e){for(var t=0;t{u();"use strict";fi.__esModule=!0;fi.default=void 0;var sS=oS(Mn()),aS=Se();function oS(r){return r&&r.__esModule?r:{default:r}}function lS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,za(r,e)}function za(r,e){return za=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},za(r,e)}var uS=function(r){lS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=aS.TAG,i}return e}(sS.default);fi.default=uS;Xp.exports=fi.default});var Ha=x((ci,Jp)=>{u();"use strict";ci.__esModule=!0;ci.default=void 0;var fS=pS(dt()),cS=Se();function pS(r){return r&&r.__esModule?r:{default:r}}function dS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Va(r,e)}function Va(r,e){return Va=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Va(r,e)}var hS=function(r){dS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=cS.STRING,i}return e}(fS.default);ci.default=hS;Jp.exports=ci.default});var Ga=x((pi,Zp)=>{u();"use strict";pi.__esModule=!0;pi.default=void 0;var mS=yS($n()),gS=Se();function yS(r){return r&&r.__esModule?r:{default:r}}function bS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,Wa(r,e)}function Wa(r,e){return Wa=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},Wa(r,e)}var wS=function(r){bS(e,r);function e(i){var n;return n=r.call(this,i)||this,n.type=gS.PSEUDO,n}var t=e.prototype;return t.toString=function(){var n=this.length?"("+this.map(String).join(",")+")":"";return[this.rawSpaceBefore,this.stringifyProperty("value"),n,this.rawSpaceAfter].join("")},e}(mS.default);pi.default=wS;Zp.exports=pi.default});var Nn={};Ge(Nn,{deprecate:()=>vS});function vS(r){return r}var Bn=R(()=>{u()});var td=x((CI,ed)=>{u();ed.exports=(Bn(),Nn).deprecate});var Za=x(mi=>{u();"use strict";mi.__esModule=!0;mi.default=void 0;mi.unescapeValue=Xa;var di=Ya(Ln()),xS=Ya(Rn()),kS=Ya(Mn()),SS=Se(),Qa;function Ya(r){return r&&r.__esModule?r:{default:r}}function rd(r,e){for(var t=0;t0&&!n.quoted&&o.before.length===0&&!(n.spaces.value&&n.spaces.value.after)&&(o.before=" "),id(s,o)}))),a.push("]"),a.push(this.rawSpaceAfter),a.join("")},AS(e,[{key:"quoted",get:function(){var n=this.quoteMark;return n==="'"||n==='"'},set:function(n){OS()}},{key:"quoteMark",get:function(){return this._quoteMark},set:function(n){if(!this._constructed){this._quoteMark=n;return}this._quoteMark!==n&&(this._quoteMark=n,this._syncRawValue())}},{key:"qualifiedAttribute",get:function(){return this.qualifiedName(this.raws.attribute||this.attribute)}},{key:"insensitiveFlag",get:function(){return this.insensitive?"i":""}},{key:"value",get:function(){return this._value},set:function(n){if(this._constructed){var a=Xa(n),s=a.deprecatedUsage,o=a.unescaped,l=a.quoteMark;if(s&&ES(),o===this._value&&l===this._quoteMark)return;this._value=o,this._quoteMark=l,this._syncRawValue()}else this._value=n}},{key:"insensitive",get:function(){return this._insensitive},set:function(n){n||(this._insensitive=!1,this.raws&&(this.raws.insensitiveFlag==="I"||this.raws.insensitiveFlag==="i")&&(this.raws.insensitiveFlag=void 0)),this._insensitive=n}},{key:"attribute",get:function(){return this._attribute},set:function(n){this._handleEscapes("attribute",n),this._attribute=n}}]),e}(kS.default);mi.default=Fn;Fn.NO_QUOTE=null;Fn.SINGLE_QUOTE="'";Fn.DOUBLE_QUOTE='"';var Ja=(Qa={"'":{quotes:"single",wrap:!0},'"':{quotes:"double",wrap:!0}},Qa[null]={isIdentifier:!0},Qa);function id(r,e){return""+e.before+r+e.after}});var to=x((gi,nd)=>{u();"use strict";gi.__esModule=!0;gi.default=void 0;var PS=DS(Mn()),IS=Se();function DS(r){return r&&r.__esModule?r:{default:r}}function qS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,eo(r,e)}function eo(r,e){return eo=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},eo(r,e)}var $S=function(r){qS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=IS.UNIVERSAL,i.value="*",i}return e}(PS.default);gi.default=$S;nd.exports=gi.default});var io=x((yi,sd)=>{u();"use strict";yi.__esModule=!0;yi.default=void 0;var LS=NS(dt()),MS=Se();function NS(r){return r&&r.__esModule?r:{default:r}}function BS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,ro(r,e)}function ro(r,e){return ro=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},ro(r,e)}var FS=function(r){BS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=MS.COMBINATOR,i}return e}(LS.default);yi.default=FS;sd.exports=yi.default});var so=x((bi,ad)=>{u();"use strict";bi.__esModule=!0;bi.default=void 0;var jS=US(dt()),zS=Se();function US(r){return r&&r.__esModule?r:{default:r}}function VS(r,e){r.prototype=Object.create(e.prototype),r.prototype.constructor=r,no(r,e)}function no(r,e){return no=Object.setPrototypeOf?Object.setPrototypeOf.bind():function(i,n){return i.__proto__=n,i},no(r,e)}var HS=function(r){VS(e,r);function e(t){var i;return i=r.call(this,t)||this,i.type=zS.NESTING,i.value="&",i}return e}(jS.default);bi.default=HS;ad.exports=bi.default});var ld=x((jn,od)=>{u();"use strict";jn.__esModule=!0;jn.default=WS;function WS(r){return r.sort(function(e,t){return e-t})}od.exports=jn.default});var ao=x(M=>{u();"use strict";M.__esModule=!0;M.word=M.tilde=M.tab=M.str=M.space=M.slash=M.singleQuote=M.semicolon=M.plus=M.pipe=M.openSquare=M.openParenthesis=M.newline=M.greaterThan=M.feed=M.equals=M.doubleQuote=M.dollar=M.cr=M.comment=M.comma=M.combinator=M.colon=M.closeSquare=M.closeParenthesis=M.caret=M.bang=M.backslash=M.at=M.asterisk=M.ampersand=void 0;var GS=38;M.ampersand=GS;var QS=42;M.asterisk=QS;var YS=64;M.at=YS;var KS=44;M.comma=KS;var XS=58;M.colon=XS;var JS=59;M.semicolon=JS;var ZS=40;M.openParenthesis=ZS;var eA=41;M.closeParenthesis=eA;var tA=91;M.openSquare=tA;var rA=93;M.closeSquare=rA;var iA=36;M.dollar=iA;var nA=126;M.tilde=nA;var sA=94;M.caret=sA;var aA=43;M.plus=aA;var oA=61;M.equals=oA;var lA=124;M.pipe=lA;var uA=62;M.greaterThan=uA;var fA=32;M.space=fA;var ud=39;M.singleQuote=ud;var cA=34;M.doubleQuote=cA;var pA=47;M.slash=pA;var dA=33;M.bang=dA;var hA=92;M.backslash=hA;var mA=13;M.cr=mA;var gA=12;M.feed=gA;var yA=10;M.newline=yA;var bA=9;M.tab=bA;var wA=ud;M.str=wA;var vA=-1;M.comment=vA;var xA=-2;M.word=xA;var kA=-3;M.combinator=kA});var pd=x(wi=>{u();"use strict";wi.__esModule=!0;wi.FIELDS=void 0;wi.default=TA;var D=SA(ao()),nr,te;function fd(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(fd=function(n){return n?t:e})(r)}function SA(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=fd(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}var AA=(nr={},nr[D.tab]=!0,nr[D.newline]=!0,nr[D.cr]=!0,nr[D.feed]=!0,nr),CA=(te={},te[D.space]=!0,te[D.tab]=!0,te[D.newline]=!0,te[D.cr]=!0,te[D.feed]=!0,te[D.ampersand]=!0,te[D.asterisk]=!0,te[D.bang]=!0,te[D.comma]=!0,te[D.colon]=!0,te[D.semicolon]=!0,te[D.openParenthesis]=!0,te[D.closeParenthesis]=!0,te[D.openSquare]=!0,te[D.closeSquare]=!0,te[D.singleQuote]=!0,te[D.doubleQuote]=!0,te[D.plus]=!0,te[D.pipe]=!0,te[D.tilde]=!0,te[D.greaterThan]=!0,te[D.equals]=!0,te[D.dollar]=!0,te[D.caret]=!0,te[D.slash]=!0,te),oo={},cd="0123456789abcdefABCDEF";for(zn=0;zn0?(k=s+v,S=w-y[v].length):(k=s,S=a),O=D.comment,s=k,p=k,d=w-S):c===D.slash?(w=o,O=c,p=s,d=o-a,l=w+1):(w=_A(t,o),O=D.word,p=s,d=w-a),l=w+1;break}e.push([O,s,o-a,p,d,o,l]),S&&(a=S,S=null),o=l}return e}});var vd=x((vi,wd)=>{u();"use strict";vi.__esModule=!0;vi.default=void 0;var RA=je(Pa()),lo=je(Da()),PA=je(La()),dd=je(Na()),IA=je(Fa()),DA=je(Ua()),uo=je(Ha()),qA=je(Ga()),hd=Un(Za()),$A=je(to()),fo=je(io()),LA=je(so()),MA=je(ld()),P=Un(pd()),$=Un(ao()),NA=Un(Se()),le=ti(),Vt,co;function md(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(md=function(n){return n?t:e})(r)}function Un(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=md(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function je(r){return r&&r.__esModule?r:{default:r}}function gd(r,e){for(var t=0;t0){var s=this.current.last;if(s){var o=this.convertWhitespaceNodesToSpace(a),l=o.space,c=o.rawSpace;c!==void 0&&(s.rawSpaceAfter+=c),s.spaces.after+=l}else a.forEach(function(O){return i.newNode(O)})}return}var f=this.currToken,d=void 0;n>this.position&&(d=this.parseWhitespaceEquivalentTokens(n));var p;if(this.isNamedCombinator()?p=this.namedCombinator():this.currToken[P.FIELDS.TYPE]===$.combinator?(p=new fo.default({value:this.content(),source:sr(this.currToken),sourceIndex:this.currToken[P.FIELDS.START_POS]}),this.position++):po[this.currToken[P.FIELDS.TYPE]]||d||this.unexpected(),p){if(d){var h=this.convertWhitespaceNodesToSpace(d),b=h.space,v=h.rawSpace;p.spaces.before=b,p.rawSpaceBefore=v}}else{var y=this.convertWhitespaceNodesToSpace(d,!0),w=y.space,k=y.rawSpace;k||(k=w);var S={},E={spaces:{}};w.endsWith(" ")&&k.endsWith(" ")?(S.before=w.slice(0,w.length-1),E.spaces.before=k.slice(0,k.length-1)):w.startsWith(" ")&&k.startsWith(" ")?(S.after=w.slice(1),E.spaces.after=k.slice(1)):E.value=k,p=new fo.default({value:" ",source:ho(f,this.tokens[this.position-1]),sourceIndex:f[P.FIELDS.START_POS],spaces:S,raws:E})}return this.currToken&&this.currToken[P.FIELDS.TYPE]===$.space&&(p.spaces.after=this.optionalSpace(this.content()),this.position++),this.newNode(p)},e.comma=function(){if(this.position===this.tokens.length-1){this.root.trailingComma=!0,this.position++;return}this.current._inferEndPosition();var i=new lo.default({source:{start:yd(this.tokens[this.position+1])}});this.current.parent.append(i),this.current=i,this.position++},e.comment=function(){var i=this.currToken;this.newNode(new dd.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.error=function(i,n){throw this.root.error(i,n)},e.missingBackslash=function(){return this.error("Expected a backslash preceding the semicolon.",{index:this.currToken[P.FIELDS.START_POS]})},e.missingParenthesis=function(){return this.expected("opening parenthesis",this.currToken[P.FIELDS.START_POS])},e.missingSquareBracket=function(){return this.expected("opening square bracket",this.currToken[P.FIELDS.START_POS])},e.unexpected=function(){return this.error("Unexpected '"+this.content()+"'. Escaping special characters with \\ may help.",this.currToken[P.FIELDS.START_POS])},e.unexpectedPipe=function(){return this.error("Unexpected '|'.",this.currToken[P.FIELDS.START_POS])},e.namespace=function(){var i=this.prevToken&&this.content(this.prevToken)||!0;if(this.nextToken[P.FIELDS.TYPE]===$.word)return this.position++,this.word(i);if(this.nextToken[P.FIELDS.TYPE]===$.asterisk)return this.position++,this.universal(i);this.unexpectedPipe()},e.nesting=function(){if(this.nextToken){var i=this.content(this.nextToken);if(i==="|"){this.position++;return}}var n=this.currToken;this.newNode(new LA.default({value:this.content(),source:sr(n),sourceIndex:n[P.FIELDS.START_POS]})),this.position++},e.parentheses=function(){var i=this.current.last,n=1;if(this.position++,i&&i.type===NA.PSEUDO){var a=new lo.default({source:{start:yd(this.tokens[this.position-1])}}),s=this.current;for(i.append(a),this.current=a;this.position1&&i.nextToken&&i.nextToken[P.FIELDS.TYPE]===$.openParenthesis&&i.error("Misplaced parenthesis.",{index:i.nextToken[P.FIELDS.START_POS]})});else return this.expected(["pseudo-class","pseudo-element"],this.currToken[P.FIELDS.START_POS])},e.space=function(){var i=this.content();this.position===0||this.prevToken[P.FIELDS.TYPE]===$.comma||this.prevToken[P.FIELDS.TYPE]===$.openParenthesis||this.current.nodes.every(function(n){return n.type==="comment"})?(this.spaces=this.optionalSpace(i),this.position++):this.position===this.tokens.length-1||this.nextToken[P.FIELDS.TYPE]===$.comma||this.nextToken[P.FIELDS.TYPE]===$.closeParenthesis?(this.current.last.spaces.after=this.optionalSpace(i),this.position++):this.combinator()},e.string=function(){var i=this.currToken;this.newNode(new uo.default({value:this.content(),source:sr(i),sourceIndex:i[P.FIELDS.START_POS]})),this.position++},e.universal=function(i){var n=this.nextToken;if(n&&this.content(n)==="|")return this.position++,this.namespace();var a=this.currToken;this.newNode(new $A.default({value:this.content(),source:sr(a),sourceIndex:a[P.FIELDS.START_POS]}),i),this.position++},e.splitWord=function(i,n){for(var a=this,s=this.nextToken,o=this.content();s&&~[$.dollar,$.caret,$.equals,$.word].indexOf(s[P.FIELDS.TYPE]);){this.position++;var l=this.content();if(o+=l,l.lastIndexOf("\\")===l.length-1){var c=this.nextToken;c&&c[P.FIELDS.TYPE]===$.space&&(o+=this.requiredSpace(this.content(c)),this.position++)}s=this.nextToken}var f=mo(o,".").filter(function(b){var v=o[b-1]==="\\",y=/^\d+\.\d+%$/.test(o);return!v&&!y}),d=mo(o,"#").filter(function(b){return o[b-1]!=="\\"}),p=mo(o,"#{");p.length&&(d=d.filter(function(b){return!~p.indexOf(b)}));var h=(0,MA.default)(jA([0].concat(f,d)));h.forEach(function(b,v){var y=h[v+1]||o.length,w=o.slice(b,y);if(v===0&&n)return n.call(a,w,h.length);var k,S=a.currToken,E=S[P.FIELDS.START_POS]+h[v],O=Ht(S[1],S[2]+b,S[3],S[2]+(y-1));if(~f.indexOf(b)){var B={value:w.slice(1),source:O,sourceIndex:E};k=new PA.default(ar(B,"value"))}else if(~d.indexOf(b)){var N={value:w.slice(1),source:O,sourceIndex:E};k=new IA.default(ar(N,"value"))}else{var T={value:w,source:O,sourceIndex:E};ar(T,"value"),k=new DA.default(T)}a.newNode(k,i),i=null}),this.position++},e.word=function(i){var n=this.nextToken;return n&&this.content(n)==="|"?(this.position++,this.namespace()):this.splitWord(i)},e.loop=function(){for(;this.position{u();"use strict";xi.__esModule=!0;xi.default=void 0;var UA=VA(vd());function VA(r){return r&&r.__esModule?r:{default:r}}var HA=function(){function r(t,i){this.func=t||function(){},this.funcRes=null,this.options=i}var e=r.prototype;return e._shouldUpdateSelector=function(i,n){n===void 0&&(n={});var a=Object.assign({},this.options,n);return a.updateSelector===!1?!1:typeof i!="string"},e._isLossy=function(i){i===void 0&&(i={});var n=Object.assign({},this.options,i);return n.lossless===!1},e._root=function(i,n){n===void 0&&(n={});var a=new UA.default(i,this._parseOptions(n));return a.root},e._parseOptions=function(i){return{lossy:this._isLossy(i)}},e._run=function(i,n){var a=this;return n===void 0&&(n={}),new Promise(function(s,o){try{var l=a._root(i,n);Promise.resolve(a.func(l)).then(function(c){var f=void 0;return a._shouldUpdateSelector(i,n)&&(f=l.toString(),i.selector=f),{transform:c,root:l,string:f}}).then(s,o)}catch(c){o(c);return}})},e._runSync=function(i,n){n===void 0&&(n={});var a=this._root(i,n),s=this.func(a);if(s&&typeof s.then=="function")throw new Error("Selector processor returned a promise to a synchronous call.");var o=void 0;return n.updateSelector&&typeof i!="string"&&(o=a.toString(),i.selector=o),{transform:s,root:a,string:o}},e.ast=function(i,n){return this._run(i,n).then(function(a){return a.root})},e.astSync=function(i,n){return this._runSync(i,n).root},e.transform=function(i,n){return this._run(i,n).then(function(a){return a.transform})},e.transformSync=function(i,n){return this._runSync(i,n).transform},e.process=function(i,n){return this._run(i,n).then(function(a){return a.string||a.root.toString()})},e.processSync=function(i,n){var a=this._runSync(i,n);return a.string||a.root.toString()},r}();xi.default=HA;xd.exports=xi.default});var Sd=x(ne=>{u();"use strict";ne.__esModule=!0;ne.universal=ne.tag=ne.string=ne.selector=ne.root=ne.pseudo=ne.nesting=ne.id=ne.comment=ne.combinator=ne.className=ne.attribute=void 0;var WA=ze(Za()),GA=ze(La()),QA=ze(io()),YA=ze(Na()),KA=ze(Fa()),XA=ze(so()),JA=ze(Ga()),ZA=ze(Pa()),eC=ze(Da()),tC=ze(Ha()),rC=ze(Ua()),iC=ze(to());function ze(r){return r&&r.__esModule?r:{default:r}}var nC=function(e){return new WA.default(e)};ne.attribute=nC;var sC=function(e){return new GA.default(e)};ne.className=sC;var aC=function(e){return new QA.default(e)};ne.combinator=aC;var oC=function(e){return new YA.default(e)};ne.comment=oC;var lC=function(e){return new KA.default(e)};ne.id=lC;var uC=function(e){return new XA.default(e)};ne.nesting=uC;var fC=function(e){return new JA.default(e)};ne.pseudo=fC;var cC=function(e){return new ZA.default(e)};ne.root=cC;var pC=function(e){return new eC.default(e)};ne.selector=pC;var dC=function(e){return new tC.default(e)};ne.string=dC;var hC=function(e){return new rC.default(e)};ne.tag=hC;var mC=function(e){return new iC.default(e)};ne.universal=mC});var Ed=x(J=>{u();"use strict";J.__esModule=!0;J.isComment=J.isCombinator=J.isClassName=J.isAttribute=void 0;J.isContainer=EC;J.isIdentifier=void 0;J.isNamespace=OC;J.isNesting=void 0;J.isNode=go;J.isPseudo=void 0;J.isPseudoClass=_C;J.isPseudoElement=_d;J.isUniversal=J.isTag=J.isString=J.isSelector=J.isRoot=void 0;var ue=Se(),Oe,gC=(Oe={},Oe[ue.ATTRIBUTE]=!0,Oe[ue.CLASS]=!0,Oe[ue.COMBINATOR]=!0,Oe[ue.COMMENT]=!0,Oe[ue.ID]=!0,Oe[ue.NESTING]=!0,Oe[ue.PSEUDO]=!0,Oe[ue.ROOT]=!0,Oe[ue.SELECTOR]=!0,Oe[ue.STRING]=!0,Oe[ue.TAG]=!0,Oe[ue.UNIVERSAL]=!0,Oe);function go(r){return typeof r=="object"&&gC[r.type]}function Ue(r,e){return go(e)&&e.type===r}var Ad=Ue.bind(null,ue.ATTRIBUTE);J.isAttribute=Ad;var yC=Ue.bind(null,ue.CLASS);J.isClassName=yC;var bC=Ue.bind(null,ue.COMBINATOR);J.isCombinator=bC;var wC=Ue.bind(null,ue.COMMENT);J.isComment=wC;var vC=Ue.bind(null,ue.ID);J.isIdentifier=vC;var xC=Ue.bind(null,ue.NESTING);J.isNesting=xC;var yo=Ue.bind(null,ue.PSEUDO);J.isPseudo=yo;var kC=Ue.bind(null,ue.ROOT);J.isRoot=kC;var SC=Ue.bind(null,ue.SELECTOR);J.isSelector=SC;var AC=Ue.bind(null,ue.STRING);J.isString=AC;var Cd=Ue.bind(null,ue.TAG);J.isTag=Cd;var CC=Ue.bind(null,ue.UNIVERSAL);J.isUniversal=CC;function _d(r){return yo(r)&&r.value&&(r.value.startsWith("::")||r.value.toLowerCase()===":before"||r.value.toLowerCase()===":after"||r.value.toLowerCase()===":first-letter"||r.value.toLowerCase()===":first-line")}function _C(r){return yo(r)&&!_d(r)}function EC(r){return!!(go(r)&&r.walk)}function OC(r){return Ad(r)||Cd(r)}});var Od=x(Ke=>{u();"use strict";Ke.__esModule=!0;var bo=Se();Object.keys(bo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===bo[r]||(Ke[r]=bo[r])});var wo=Sd();Object.keys(wo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===wo[r]||(Ke[r]=wo[r])});var vo=Ed();Object.keys(vo).forEach(function(r){r==="default"||r==="__esModule"||r in Ke&&Ke[r]===vo[r]||(Ke[r]=vo[r])})});var it=x((ki,Rd)=>{u();"use strict";ki.__esModule=!0;ki.default=void 0;var TC=IC(kd()),RC=PC(Od());function Td(r){if(typeof WeakMap!="function")return null;var e=new WeakMap,t=new WeakMap;return(Td=function(n){return n?t:e})(r)}function PC(r,e){if(!e&&r&&r.__esModule)return r;if(r===null||typeof r!="object"&&typeof r!="function")return{default:r};var t=Td(e);if(t&&t.has(r))return t.get(r);var i={},n=Object.defineProperty&&Object.getOwnPropertyDescriptor;for(var a in r)if(a!=="default"&&Object.prototype.hasOwnProperty.call(r,a)){var s=n?Object.getOwnPropertyDescriptor(r,a):null;s&&(s.get||s.set)?Object.defineProperty(i,a,s):i[a]=r[a]}return i.default=r,t&&t.set(r,i),i}function IC(r){return r&&r.__esModule?r:{default:r}}var xo=function(e){return new TC.default(e)};Object.assign(xo,RC);delete xo.__esModule;var DC=xo;ki.default=DC;Rd.exports=ki.default});function mt(r){return["fontSize","outline"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e[0]),e):r==="fontFamily"?e=>{typeof e=="function"&&(e=e({}));let t=Array.isArray(e)&&ke(e[1])?e[0]:e;return Array.isArray(t)?t.join(", "):t}:["boxShadow","transitionProperty","transitionDuration","transitionDelay","transitionTimingFunction","backgroundImage","backgroundSize","backgroundColor","cursor","animation"].includes(r)?e=>(typeof e=="function"&&(e=e({})),Array.isArray(e)&&(e=e.join(", ")),e):["gridTemplateColumns","gridTemplateRows","objectPosition"].includes(r)?e=>(typeof e=="function"&&(e=e({})),typeof e=="string"&&(e=ee.list.comma(e).join(" ")),e):(e,t={})=>(typeof e=="function"&&(e=e(t)),e)}var Si=R(()=>{u();Ot();Kt()});var Md=x(($I,_o)=>{u();var{Rule:Pd,AtRule:qC}=$e(),Id=it();function ko(r,e){let t;try{Id(i=>{t=i}).processSync(r)}catch(i){throw r.includes(":")?e?e.error("Missed semicolon"):i:e?e.error(i.message):i}return t.at(0)}function Dd(r,e){let t=!1;return r.each(i=>{if(i.type==="nesting"){let n=e.clone({});i.value!=="&"?i.replaceWith(ko(i.value.replace("&",n.toString()))):i.replaceWith(n),t=!0}else"nodes"in i&&i.nodes&&Dd(i,e)&&(t=!0)}),t}function qd(r,e){let t=[];return r.selectors.forEach(i=>{let n=ko(i,r);e.selectors.forEach(a=>{if(!a)return;let s=ko(a,e);Dd(s,n)||(s.prepend(Id.combinator({value:" "})),s.prepend(n.clone({}))),t.push(s.toString())})}),t}function Vn(r,e){let t=r.prev();for(e.after(r);t&&t.type==="comment";){let i=t.prev();e.after(t),t=i}return r}function $C(r){return function e(t,i,n,a=n){let s=[];if(i.each(o=>{o.type==="rule"&&n?a&&(o.selectors=qd(t,o)):o.type==="atrule"&&o.nodes?r[o.name]?e(t,o,a):i[Ao]!==!1&&s.push(o):s.push(o)}),n&&s.length){let o=t.clone({nodes:[]});for(let l of s)o.append(l);i.prepend(o)}}}function So(r,e,t){let i=new Pd({selector:r,nodes:[]});return i.append(e),t.after(i),i}function $d(r,e){let t={};for(let i of r)t[i]=!0;if(e)for(let i of e)t[i.replace(/^@/,"")]=!0;return t}function LC(r){r=r.trim();let e=r.match(/^\((.*)\)$/);if(!e)return{type:"basic",selector:r};let t=e[1].match(/^(with(?:out)?):(.+)$/);if(t){let i=t[1]==="with",n=Object.fromEntries(t[2].trim().split(/\s+/).map(s=>[s,!0]));if(i&&n.all)return{type:"noop"};let a=s=>!!n[s];return n.all?a=()=>!0:i&&(a=s=>s==="all"?!1:!n[s]),{type:"withrules",escapes:a}}return{type:"unknown"}}function MC(r){let e=[],t=r.parent;for(;t&&t instanceof qC;)e.push(t),t=t.parent;return e}function NC(r){let e=r[Ld];if(!e)r.after(r.nodes);else{let t=r.nodes,i,n=-1,a,s,o,l=MC(r);if(l.forEach((c,f)=>{if(e(c.name))i=c,n=f,s=o;else{let d=o;o=c.clone({nodes:[]}),d&&o.append(d),a=a||o}}),i?s?(a.append(t),i.after(s)):i.after(t):r.after(t),r.next()&&i){let c;l.slice(0,n+1).forEach((f,d,p)=>{let h=c;c=f.clone({nodes:[]}),h&&c.append(h);let b=[],y=(p[d-1]||r).next();for(;y;)b.push(y),y=y.next();c.append(b)}),c&&(s||t[t.length-1]).after(c)}}r.remove()}var Ao=Symbol("rootRuleMergeSel"),Ld=Symbol("rootRuleEscapes");function BC(r){let{params:e}=r,{type:t,selector:i,escapes:n}=LC(e);if(t==="unknown")throw r.error(`Unknown @${r.name} parameter ${JSON.stringify(e)}`);if(t==="basic"&&i){let a=new Pd({selector:i,nodes:r.nodes});r.removeAll(),r.append(a)}r[Ld]=n,r[Ao]=n?!n("all"):t==="noop"}var Co=Symbol("hasRootRule");_o.exports=(r={})=>{let e=$d(["media","supports","layer","container"],r.bubble),t=$C(e),i=$d(["document","font-face","keyframes","-webkit-keyframes","-moz-keyframes"],r.unwrap),n=(r.rootRuleName||"at-root").replace(/^@/,""),a=r.preserveEmpty;return{postcssPlugin:"postcss-nested",Once(s){s.walkAtRules(n,o=>{BC(o),s[Co]=!0})},Rule(s){let o=!1,l=s,c=!1,f=[];s.each(d=>{d.type==="rule"?(f.length&&(l=So(s.selector,f,l),f=[]),c=!0,o=!0,d.selectors=qd(s,d),l=Vn(d,l)):d.type==="atrule"?(f.length&&(l=So(s.selector,f,l),f=[]),d.name===n?(o=!0,t(s,d,!0,d[Ao]),l=Vn(d,l)):e[d.name]?(c=!0,o=!0,t(s,d,!0),l=Vn(d,l)):i[d.name]?(c=!0,o=!0,t(s,d,!1),l=Vn(d,l)):c&&f.push(d)):d.type==="decl"&&c&&f.push(d)}),f.length&&(l=So(s.selector,f,l)),o&&a!==!0&&(s.raws.semicolon=!0,s.nodes.length===0&&s.remove())},RootExit(s){s[Co]&&(s.walkAtRules(n,NC),s[Co]=!1)}}};_o.exports.postcss=!0});var jd=x((LI,Fd)=>{u();"use strict";var Nd=/-(\w|$)/g,Bd=(r,e)=>e.toUpperCase(),FC=r=>(r=r.toLowerCase(),r==="float"?"cssFloat":r.startsWith("-ms-")?r.substr(1).replace(Nd,Bd):r.replace(Nd,Bd));Fd.exports=FC});var To=x((MI,zd)=>{u();var jC=jd(),zC={boxFlex:!0,boxFlexGroup:!0,columnCount:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,strokeDashoffset:!0,strokeOpacity:!0,strokeWidth:!0};function Eo(r){return typeof r.nodes=="undefined"?!0:Oo(r)}function Oo(r){let e,t={};return r.each(i=>{if(i.type==="atrule")e="@"+i.name,i.params&&(e+=" "+i.params),typeof t[e]=="undefined"?t[e]=Eo(i):Array.isArray(t[e])?t[e].push(Eo(i)):t[e]=[t[e],Eo(i)];else if(i.type==="rule"){let n=Oo(i);if(t[i.selector])for(let a in n)t[i.selector][a]=n[a];else t[i.selector]=n}else if(i.type==="decl"){i.prop[0]==="-"&&i.prop[1]==="-"||i.parent&&i.parent.selector===":export"?e=i.prop:e=jC(i.prop);let n=i.value;!isNaN(i.value)&&zC[e]&&(n=parseFloat(i.value)),i.important&&(n+=" !important"),typeof t[e]=="undefined"?t[e]=n:Array.isArray(t[e])?t[e].push(n):t[e]=[t[e],n]}}),t}zd.exports=Oo});var Hn=x((NI,Wd)=>{u();var Ai=$e(),Ud=/\s*!important\s*$/i,UC={"box-flex":!0,"box-flex-group":!0,"column-count":!0,flex:!0,"flex-grow":!0,"flex-positive":!0,"flex-shrink":!0,"flex-negative":!0,"font-weight":!0,"line-clamp":!0,"line-height":!0,opacity:!0,order:!0,orphans:!0,"tab-size":!0,widows:!0,"z-index":!0,zoom:!0,"fill-opacity":!0,"stroke-dashoffset":!0,"stroke-opacity":!0,"stroke-width":!0};function VC(r){return r.replace(/([A-Z])/g,"-$1").replace(/^ms-/,"-ms-").toLowerCase()}function Vd(r,e,t){t===!1||t===null||(e.startsWith("--")||(e=VC(e)),typeof t=="number"&&(t===0||UC[e]?t=t.toString():t+="px"),e==="css-float"&&(e="float"),Ud.test(t)?(t=t.replace(Ud,""),r.push(Ai.decl({prop:e,value:t,important:!0}))):r.push(Ai.decl({prop:e,value:t})))}function Hd(r,e,t){let i=Ai.atRule({name:e[1],params:e[3]||""});typeof t=="object"&&(i.nodes=[],Ro(t,i)),r.push(i)}function Ro(r,e){let t,i,n;for(t in r)if(i=r[t],!(i===null||typeof i=="undefined"))if(t[0]==="@"){let a=t.match(/@(\S+)(\s+([\W\w]*)\s*)?/);if(Array.isArray(i))for(let s of i)Hd(e,a,s);else Hd(e,a,i)}else if(Array.isArray(i))for(let a of i)Vd(e,t,a);else typeof i=="object"?(n=Ai.rule({selector:t}),Ro(i,n),e.push(n)):Vd(e,t,i)}Wd.exports=function(r){let e=Ai.root();return Ro(r,e),e}});var Po=x((BI,Gd)=>{u();var HC=To();Gd.exports=function(e){return console&&console.warn&&e.warnings().forEach(t=>{let i=t.plugin||"PostCSS";console.warn(i+": "+t.text)}),HC(e.root)}});var Yd=x((FI,Qd)=>{u();var WC=$e(),GC=Po(),QC=Hn();Qd.exports=function(e){let t=WC(e);return async i=>{let n=await t.process(i,{parser:QC,from:void 0});return GC(n)}}});var Xd=x((jI,Kd)=>{u();var YC=$e(),KC=Po(),XC=Hn();Kd.exports=function(r){let e=YC(r);return t=>{let i=e.process(t,{parser:XC,from:void 0});return KC(i)}}});var Zd=x((zI,Jd)=>{u();var JC=To(),ZC=Hn(),e_=Yd(),t_=Xd();Jd.exports={objectify:JC,parse:ZC,async:e_,sync:t_}});var or,eh,UI,VI,HI,WI,th=R(()=>{u();or=pe(Zd()),eh=or.default,UI=or.default.objectify,VI=or.default.parse,HI=or.default.async,WI=or.default.sync});function lr(r){return Array.isArray(r)?r.flatMap(e=>ee([(0,rh.default)({bubble:["screen"]})]).process(e,{parser:eh}).root.nodes):lr([r])}var rh,Io=R(()=>{u();Ot();rh=pe(Md());th()});function ur(r,e,t=!1){if(r==="")return e;let i=typeof e=="string"?(0,ih.default)().astSync(e):e;return i.walkClasses(n=>{let a=n.value,s=t&&a.startsWith("-");n.value=s?`-${r}${a.slice(1)}`:`${r}${a}`}),typeof e=="string"?i.toString():i}var ih,Wn=R(()=>{u();ih=pe(it())});function Te(r){let e=nh.default.className();return e.value=r,jt(e?.raws?.value??e.value)}var nh,fr=R(()=>{u();nh=pe(it());Ki()});function Do(r){return jt(`.${Te(r)}`)}function Gn(r,e){return Do(Ci(r,e))}function Ci(r,e){return e==="DEFAULT"?r:e==="-"||e==="-DEFAULT"?`-${r}`:e.startsWith("-")?`-${r}${e}`:e.startsWith("/")?`${r}${e}`:`${r}-${e}`}var qo=R(()=>{u();fr();Ki()});function L(r,e=[[r,[r]]],{filterDefault:t=!1,...i}={}){let n=mt(r);return function({matchUtilities:a,theme:s}){for(let o of e){let l=Array.isArray(o[0])?o:[o];a(l.reduce((c,[f,d])=>Object.assign(c,{[f]:p=>d.reduce((h,b)=>Array.isArray(b)?Object.assign(h,{[b[0]]:b[1]}):Object.assign(h,{[b]:n(p)}),{})}),{}),{...i,values:t?Object.fromEntries(Object.entries(s(r)??{}).filter(([c])=>c!=="DEFAULT")):s(r)})}}}var sh=R(()=>{u();Si()});function Tt(r){return r=Array.isArray(r)?r:[r],r.map(e=>{let t=e.values.map(i=>i.raw!==void 0?i.raw:[i.min&&`(min-width: ${i.min})`,i.max&&`(max-width: ${i.max})`].filter(Boolean).join(" and "));return e.not?`not all and ${t}`:t}).join(", ")}var Qn=R(()=>{u()});function $o(r){return r.split(l_).map(t=>{let i=t.trim(),n={value:i},a=i.split(u_),s=new Set;for(let o of a)!s.has("DIRECTIONS")&&r_.has(o)?(n.direction=o,s.add("DIRECTIONS")):!s.has("PLAY_STATES")&&i_.has(o)?(n.playState=o,s.add("PLAY_STATES")):!s.has("FILL_MODES")&&n_.has(o)?(n.fillMode=o,s.add("FILL_MODES")):!s.has("ITERATION_COUNTS")&&(s_.has(o)||f_.test(o))?(n.iterationCount=o,s.add("ITERATION_COUNTS")):!s.has("TIMING_FUNCTION")&&a_.has(o)||!s.has("TIMING_FUNCTION")&&o_.some(l=>o.startsWith(`${l}(`))?(n.timingFunction=o,s.add("TIMING_FUNCTION")):!s.has("DURATION")&&ah.test(o)?(n.duration=o,s.add("DURATION")):!s.has("DELAY")&&ah.test(o)?(n.delay=o,s.add("DELAY")):s.has("NAME")?(n.unknown||(n.unknown=[]),n.unknown.push(o)):(n.name=o,s.add("NAME"));return n})}var r_,i_,n_,s_,a_,o_,l_,u_,ah,f_,oh=R(()=>{u();r_=new Set(["normal","reverse","alternate","alternate-reverse"]),i_=new Set(["running","paused"]),n_=new Set(["none","forwards","backwards","both"]),s_=new Set(["infinite"]),a_=new Set(["linear","ease","ease-in","ease-out","ease-in-out","step-start","step-end"]),o_=["cubic-bezier","steps"],l_=/\,(?![^(]*\))/g,u_=/\ +(?![^(]*\))/g,ah=/^(-?[\d.]+m?s)$/,f_=/^(\d+)$/});var lh,xe,uh=R(()=>{u();lh=r=>Object.assign({},...Object.entries(r??{}).flatMap(([e,t])=>typeof t=="object"?Object.entries(lh(t)).map(([i,n])=>({[e+(i==="DEFAULT"?"":`-${i}`)]:n})):[{[`${e}`]:t}])),xe=lh});var ch,fh=R(()=>{ch="3.4.14"});function Rt(r,e=!0){return Array.isArray(r)?r.map(t=>{if(e&&Array.isArray(t))throw new Error("The tuple syntax is not supported for `screens`.");if(typeof t=="string")return{name:t.toString(),not:!1,values:[{min:t,max:void 0}]};let[i,n]=t;return i=i.toString(),typeof n=="string"?{name:i,not:!1,values:[{min:n,max:void 0}]}:Array.isArray(n)?{name:i,not:!1,values:n.map(a=>dh(a))}:{name:i,not:!1,values:[dh(n)]}}):Rt(Object.entries(r??{}),!1)}function Yn(r){return r.values.length!==1?{result:!1,reason:"multiple-values"}:r.values[0].raw!==void 0?{result:!1,reason:"raw-values"}:r.values[0].min!==void 0&&r.values[0].max!==void 0?{result:!1,reason:"min-and-max"}:{result:!0,reason:null}}function ph(r,e,t){let i=Kn(e,r),n=Kn(t,r),a=Yn(i),s=Yn(n);if(a.reason==="multiple-values"||s.reason==="multiple-values")throw new Error("Attempted to sort a screen with multiple values. This should never happen. Please open a bug report.");if(a.reason==="raw-values"||s.reason==="raw-values")throw new Error("Attempted to sort a screen with raw values. This should never happen. Please open a bug report.");if(a.reason==="min-and-max"||s.reason==="min-and-max")throw new Error("Attempted to sort a screen with both min and max values. This should never happen. Please open a bug report.");let{min:o,max:l}=i.values[0],{min:c,max:f}=n.values[0];e.not&&([o,l]=[l,o]),t.not&&([c,f]=[f,c]),o=o===void 0?o:parseFloat(o),l=l===void 0?l:parseFloat(l),c=c===void 0?c:parseFloat(c),f=f===void 0?f:parseFloat(f);let[d,p]=r==="min"?[o,c]:[f,l];return d-p}function Kn(r,e){return typeof r=="object"?r:{name:"arbitrary-screen",values:[{[e]:r}]}}function dh({"min-width":r,min:e=r,max:t,raw:i}={}){return{min:e,max:t,raw:i}}var Xn=R(()=>{u()});function Jn(r,e){r.walkDecls(t=>{if(e.includes(t.prop)){t.remove();return}for(let i of e)t.value.includes(`/ var(${i})`)&&(t.value=t.value.replace(`/ var(${i})`,""))})}var hh=R(()=>{u()});var se,Xe,nt,ge,mh,gh=R(()=>{u();ft();et();Ot();sh();Qn();fr();oh();uh();Lr();ea();Kt();Si();fh();Be();Xn();Gs();hh();ct();Br();_i();se={childVariant:({addVariant:r})=>{r("*","& > *")},pseudoElementVariants:({addVariant:r})=>{r("first-letter","&::first-letter"),r("first-line","&::first-line"),r("marker",[({container:e})=>(Jn(e,["--tw-text-opacity"]),"& *::marker"),({container:e})=>(Jn(e,["--tw-text-opacity"]),"&::marker")]),r("selection",["& *::selection","&::selection"]),r("file","&::file-selector-button"),r("placeholder","&::placeholder"),r("backdrop","&::backdrop"),r("before",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::before")),r("after",({container:e})=>(e.walkRules(t=>{let i=!1;t.walkDecls("content",()=>{i=!0}),i||t.prepend(ee.decl({prop:"content",value:"var(--tw-content)"}))}),"&::after"))},pseudoClassVariants:({addVariant:r,matchVariant:e,config:t,prefix:i})=>{let n=[["first","&:first-child"],["last","&:last-child"],["only","&:only-child"],["odd","&:nth-child(odd)"],["even","&:nth-child(even)"],"first-of-type","last-of-type","only-of-type",["visited",({container:s})=>(Jn(s,["--tw-text-opacity","--tw-border-opacity","--tw-bg-opacity"]),"&:visited")],"target",["open","&[open]"],"default","checked","indeterminate","placeholder-shown","autofill","optional","required","valid","invalid","in-range","out-of-range","read-only","empty","focus-within",["hover",we(t(),"hoverOnlyWhenSupported")?"@media (hover: hover) and (pointer: fine) { &:hover }":"&:hover"],"focus","focus-visible","active","enabled","disabled"].map(s=>Array.isArray(s)?s:[s,`&:${s}`]);for(let[s,o]of n)r(s,l=>typeof o=="function"?o(l):o);let a={group:(s,{modifier:o})=>o?[`:merge(${i(".group")}\\/${Te(o)})`," &"]:[`:merge(${i(".group")})`," &"],peer:(s,{modifier:o})=>o?[`:merge(${i(".peer")}\\/${Te(o)})`," ~ &"]:[`:merge(${i(".peer")})`," ~ &"]};for(let[s,o]of Object.entries(a))e(s,(l="",c)=>{let f=K(typeof l=="function"?l(c):l);f.includes("&")||(f="&"+f);let[d,p]=o("",c),h=null,b=null,v=0;for(let y=0;y{r("ltr",'&:where([dir="ltr"], [dir="ltr"] *)'),r("rtl",'&:where([dir="rtl"], [dir="rtl"] *)')},reducedMotionVariants:({addVariant:r})=>{r("motion-safe","@media (prefers-reduced-motion: no-preference)"),r("motion-reduce","@media (prefers-reduced-motion: reduce)")},darkVariants:({config:r,addVariant:e})=>{let[t,i=".dark"]=[].concat(r("darkMode","media"));if(t===!1&&(t="media",G.warn("darkmode-false",["The `darkMode` option in your Tailwind CSS configuration is set to `false`, which now behaves the same as `media`.","Change `darkMode` to `media` or remove it entirely.","https://tailwindcss.com/docs/upgrade-guide#remove-dark-mode-configuration"])),t==="variant"){let n;if(Array.isArray(i)||typeof i=="function"?n=i:typeof i=="string"&&(n=[i]),Array.isArray(n))for(let a of n)a===".dark"?(t=!1,G.warn("darkmode-variant-without-selector",["When using `variant` for `darkMode`, you must provide a selector.",'Example: `darkMode: ["variant", ".your-selector &"]`'])):a.includes("&")||(t=!1,G.warn("darkmode-variant-without-ampersand",["When using `variant` for `darkMode`, your selector must contain `&`.",'Example `darkMode: ["variant", ".your-selector &"]`']));i=n}t==="selector"?e("dark",`&:where(${i}, ${i} *)`):t==="media"?e("dark","@media (prefers-color-scheme: dark)"):t==="variant"?e("dark",i):t==="class"&&e("dark",`&:is(${i} *)`)},printVariant:({addVariant:r})=>{r("print","@media print")},screenVariants:({theme:r,addVariant:e,matchVariant:t})=>{let i=r("screens")??{},n=Object.values(i).every(w=>typeof w=="string"),a=Rt(r("screens")),s=new Set([]);function o(w){return w.match(/(\D+)$/)?.[1]??"(none)"}function l(w){w!==void 0&&s.add(o(w))}function c(w){return l(w),s.size===1}for(let w of a)for(let k of w.values)l(k.min),l(k.max);let f=s.size<=1;function d(w){return Object.fromEntries(a.filter(k=>Yn(k).result).map(k=>{let{min:S,max:E}=k.values[0];if(w==="min"&&S!==void 0)return k;if(w==="min"&&E!==void 0)return{...k,not:!k.not};if(w==="max"&&E!==void 0)return k;if(w==="max"&&S!==void 0)return{...k,not:!k.not}}).map(k=>[k.name,k]))}function p(w){return(k,S)=>ph(w,k.value,S.value)}let h=p("max"),b=p("min");function v(w){return k=>{if(n)if(f){if(typeof k=="string"&&!c(k))return G.warn("minmax-have-mixed-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[]}else return G.warn("mixed-screen-units",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing mixed units."]),[];else return G.warn("complex-screen-config",["The `min-*` and `max-*` variants are not supported with a `screens` configuration containing objects."]),[];return[`@media ${Tt(Kn(k,w))}`]}}t("max",v("max"),{sort:h,values:n?d("max"):{}});let y="min-screens";for(let w of a)e(w.name,`@media ${Tt(w)}`,{id:y,sort:n&&f?b:void 0,value:w});t("min",v("min"),{id:y,sort:b})},supportsVariants:({matchVariant:r,theme:e})=>{r("supports",(t="")=>{let i=K(t),n=/^\w*\s*\(/.test(i);return i=n?i.replace(/\b(and|or|not)\b/g," $1 "):i,n?`@supports ${i}`:(i.includes(":")||(i=`${i}: var(--tw)`),i.startsWith("(")&&i.endsWith(")")||(i=`(${i})`),`@supports ${i}`)},{values:e("supports")??{}})},hasVariants:({matchVariant:r,prefix:e})=>{r("has",t=>`&:has(${K(t)})`,{values:{},[Pt]:{respectPrefix:!1}}),r("group-has",(t,{modifier:i})=>i?`:merge(${e(".group")}\\/${i}):has(${K(t)}) &`:`:merge(${e(".group")}):has(${K(t)}) &`,{values:{},[Pt]:{respectPrefix:!1}}),r("peer-has",(t,{modifier:i})=>i?`:merge(${e(".peer")}\\/${i}):has(${K(t)}) ~ &`:`:merge(${e(".peer")}):has(${K(t)}) ~ &`,{values:{},[Pt]:{respectPrefix:!1}})},ariaVariants:({matchVariant:r,theme:e})=>{r("aria",t=>`&[aria-${Ye(K(t))}]`,{values:e("aria")??{}}),r("group-aria",(t,{modifier:i})=>i?`:merge(.group\\/${i})[aria-${Ye(K(t))}] &`:`:merge(.group)[aria-${Ye(K(t))}] &`,{values:e("aria")??{}}),r("peer-aria",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[aria-${Ye(K(t))}] ~ &`:`:merge(.peer)[aria-${Ye(K(t))}] ~ &`,{values:e("aria")??{}})},dataVariants:({matchVariant:r,theme:e})=>{r("data",t=>`&[data-${Ye(K(t))}]`,{values:e("data")??{}}),r("group-data",(t,{modifier:i})=>i?`:merge(.group\\/${i})[data-${Ye(K(t))}] &`:`:merge(.group)[data-${Ye(K(t))}] &`,{values:e("data")??{}}),r("peer-data",(t,{modifier:i})=>i?`:merge(.peer\\/${i})[data-${Ye(K(t))}] ~ &`:`:merge(.peer)[data-${Ye(K(t))}] ~ &`,{values:e("data")??{}})},orientationVariants:({addVariant:r})=>{r("portrait","@media (orientation: portrait)"),r("landscape","@media (orientation: landscape)")},prefersContrastVariants:({addVariant:r})=>{r("contrast-more","@media (prefers-contrast: more)"),r("contrast-less","@media (prefers-contrast: less)")},forcedColorsVariants:({addVariant:r})=>{r("forced-colors","@media (forced-colors: active)")}},Xe=["translate(var(--tw-translate-x), var(--tw-translate-y))","rotate(var(--tw-rotate))","skewX(var(--tw-skew-x))","skewY(var(--tw-skew-y))","scaleX(var(--tw-scale-x))","scaleY(var(--tw-scale-y))"].join(" "),nt=["var(--tw-blur)","var(--tw-brightness)","var(--tw-contrast)","var(--tw-grayscale)","var(--tw-hue-rotate)","var(--tw-invert)","var(--tw-saturate)","var(--tw-sepia)","var(--tw-drop-shadow)"].join(" "),ge=["var(--tw-backdrop-blur)","var(--tw-backdrop-brightness)","var(--tw-backdrop-contrast)","var(--tw-backdrop-grayscale)","var(--tw-backdrop-hue-rotate)","var(--tw-backdrop-invert)","var(--tw-backdrop-opacity)","var(--tw-backdrop-saturate)","var(--tw-backdrop-sepia)"].join(" "),mh={preflight:({addBase:r})=>{let e=ee.parse(`*,::after,::before{box-sizing:border-box;border-width:0;border-style:solid;border-color:theme('borderColor.DEFAULT', currentColor)}::after,::before{--tw-content:''}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;tab-size:4;font-family:theme('fontFamily.sans', ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji");font-feature-settings:theme('fontFamily.sans[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.sans[1].fontVariationSettings', normal);-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:theme('fontFamily.mono', ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace);font-feature-settings:theme('fontFamily.mono[1].fontFeatureSettings', normal);font-variation-settings:theme('fontFamily.mono[1].fontVariationSettings', normal);font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::placeholder,textarea::placeholder{opacity:1;color:theme('colors.gray.4', #9ca3af)}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}`);r([ee.comment({text:`! tailwindcss v${ch} | MIT License | https://tailwindcss.com`}),...e.nodes])},container:(()=>{function r(t=[]){return t.flatMap(i=>i.values.map(n=>n.min)).filter(i=>i!==void 0)}function e(t,i,n){if(typeof n=="undefined")return[];if(!(typeof n=="object"&&n!==null))return[{screen:"DEFAULT",minWidth:0,padding:n}];let a=[];n.DEFAULT&&a.push({screen:"DEFAULT",minWidth:0,padding:n.DEFAULT});for(let s of t)for(let o of i)for(let{min:l}of o.values)l===s&&a.push({minWidth:s,padding:n[o.name]});return a}return function({addComponents:t,theme:i}){let n=Rt(i("container.screens",i("screens"))),a=r(n),s=e(a,n,i("container.padding")),o=c=>{let f=s.find(d=>d.minWidth===c);return f?{paddingRight:f.padding,paddingLeft:f.padding}:{}},l=Array.from(new Set(a.slice().sort((c,f)=>parseInt(c)-parseInt(f)))).map(c=>({[`@media (min-width: ${c})`]:{".container":{"max-width":c,...o(c)}}}));t([{".container":Object.assign({width:"100%"},i("container.center",!1)?{marginRight:"auto",marginLeft:"auto"}:{},o(0))},...l])}})(),accessibility:({addUtilities:r})=>{r({".sr-only":{position:"absolute",width:"1px",height:"1px",padding:"0",margin:"-1px",overflow:"hidden",clip:"rect(0, 0, 0, 0)",whiteSpace:"nowrap",borderWidth:"0"},".not-sr-only":{position:"static",width:"auto",height:"auto",padding:"0",margin:"0",overflow:"visible",clip:"auto",whiteSpace:"normal"}})},pointerEvents:({addUtilities:r})=>{r({".pointer-events-none":{"pointer-events":"none"},".pointer-events-auto":{"pointer-events":"auto"}})},visibility:({addUtilities:r})=>{r({".visible":{visibility:"visible"},".invisible":{visibility:"hidden"},".collapse":{visibility:"collapse"}})},position:({addUtilities:r})=>{r({".static":{position:"static"},".fixed":{position:"fixed"},".absolute":{position:"absolute"},".relative":{position:"relative"},".sticky":{position:"sticky"}})},inset:L("inset",[["inset",["inset"]],[["inset-x",["left","right"]],["inset-y",["top","bottom"]]],[["start",["inset-inline-start"]],["end",["inset-inline-end"]],["top",["top"]],["right",["right"]],["bottom",["bottom"]],["left",["left"]]]],{supportsNegativeValues:!0}),isolation:({addUtilities:r})=>{r({".isolate":{isolation:"isolate"},".isolation-auto":{isolation:"auto"}})},zIndex:L("zIndex",[["z",["zIndex"]]],{supportsNegativeValues:!0}),order:L("order",void 0,{supportsNegativeValues:!0}),gridColumn:L("gridColumn",[["col",["gridColumn"]]]),gridColumnStart:L("gridColumnStart",[["col-start",["gridColumnStart"]]],{supportsNegativeValues:!0}),gridColumnEnd:L("gridColumnEnd",[["col-end",["gridColumnEnd"]]],{supportsNegativeValues:!0}),gridRow:L("gridRow",[["row",["gridRow"]]]),gridRowStart:L("gridRowStart",[["row-start",["gridRowStart"]]],{supportsNegativeValues:!0}),gridRowEnd:L("gridRowEnd",[["row-end",["gridRowEnd"]]],{supportsNegativeValues:!0}),float:({addUtilities:r})=>{r({".float-start":{float:"inline-start"},".float-end":{float:"inline-end"},".float-right":{float:"right"},".float-left":{float:"left"},".float-none":{float:"none"}})},clear:({addUtilities:r})=>{r({".clear-start":{clear:"inline-start"},".clear-end":{clear:"inline-end"},".clear-left":{clear:"left"},".clear-right":{clear:"right"},".clear-both":{clear:"both"},".clear-none":{clear:"none"}})},margin:L("margin",[["m",["margin"]],[["mx",["margin-left","margin-right"]],["my",["margin-top","margin-bottom"]]],[["ms",["margin-inline-start"]],["me",["margin-inline-end"]],["mt",["margin-top"]],["mr",["margin-right"]],["mb",["margin-bottom"]],["ml",["margin-left"]]]],{supportsNegativeValues:!0}),boxSizing:({addUtilities:r})=>{r({".box-border":{"box-sizing":"border-box"},".box-content":{"box-sizing":"content-box"}})},lineClamp:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"line-clamp":i=>({overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical","-webkit-line-clamp":`${i}`})},{values:t("lineClamp")}),e({".line-clamp-none":{overflow:"visible",display:"block","-webkit-box-orient":"horizontal","-webkit-line-clamp":"none"}})},display:({addUtilities:r})=>{r({".block":{display:"block"},".inline-block":{display:"inline-block"},".inline":{display:"inline"},".flex":{display:"flex"},".inline-flex":{display:"inline-flex"},".table":{display:"table"},".inline-table":{display:"inline-table"},".table-caption":{display:"table-caption"},".table-cell":{display:"table-cell"},".table-column":{display:"table-column"},".table-column-group":{display:"table-column-group"},".table-footer-group":{display:"table-footer-group"},".table-header-group":{display:"table-header-group"},".table-row-group":{display:"table-row-group"},".table-row":{display:"table-row"},".flow-root":{display:"flow-root"},".grid":{display:"grid"},".inline-grid":{display:"inline-grid"},".contents":{display:"contents"},".list-item":{display:"list-item"},".hidden":{display:"none"}})},aspectRatio:L("aspectRatio",[["aspect",["aspect-ratio"]]]),size:L("size",[["size",["width","height"]]]),height:L("height",[["h",["height"]]]),maxHeight:L("maxHeight",[["max-h",["maxHeight"]]]),minHeight:L("minHeight",[["min-h",["minHeight"]]]),width:L("width",[["w",["width"]]]),minWidth:L("minWidth",[["min-w",["minWidth"]]]),maxWidth:L("maxWidth",[["max-w",["maxWidth"]]]),flex:L("flex"),flexShrink:L("flexShrink",[["flex-shrink",["flex-shrink"]],["shrink",["flex-shrink"]]]),flexGrow:L("flexGrow",[["flex-grow",["flex-grow"]],["grow",["flex-grow"]]]),flexBasis:L("flexBasis",[["basis",["flex-basis"]]]),tableLayout:({addUtilities:r})=>{r({".table-auto":{"table-layout":"auto"},".table-fixed":{"table-layout":"fixed"}})},captionSide:({addUtilities:r})=>{r({".caption-top":{"caption-side":"top"},".caption-bottom":{"caption-side":"bottom"}})},borderCollapse:({addUtilities:r})=>{r({".border-collapse":{"border-collapse":"collapse"},".border-separate":{"border-collapse":"separate"}})},borderSpacing:({addDefaults:r,matchUtilities:e,theme:t})=>{r("border-spacing",{"--tw-border-spacing-x":0,"--tw-border-spacing-y":0}),e({"border-spacing":i=>({"--tw-border-spacing-x":i,"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-x":i=>({"--tw-border-spacing-x":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"}),"border-spacing-y":i=>({"--tw-border-spacing-y":i,"@defaults border-spacing":{},"border-spacing":"var(--tw-border-spacing-x) var(--tw-border-spacing-y)"})},{values:t("borderSpacing")})},transformOrigin:L("transformOrigin",[["origin",["transformOrigin"]]]),translate:L("translate",[[["translate-x",[["@defaults transform",{}],"--tw-translate-x",["transform",Xe]]],["translate-y",[["@defaults transform",{}],"--tw-translate-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),rotate:L("rotate",[["rotate",[["@defaults transform",{}],"--tw-rotate",["transform",Xe]]]],{supportsNegativeValues:!0}),skew:L("skew",[[["skew-x",[["@defaults transform",{}],"--tw-skew-x",["transform",Xe]]],["skew-y",[["@defaults transform",{}],"--tw-skew-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),scale:L("scale",[["scale",[["@defaults transform",{}],"--tw-scale-x","--tw-scale-y",["transform",Xe]]],[["scale-x",[["@defaults transform",{}],"--tw-scale-x",["transform",Xe]]],["scale-y",[["@defaults transform",{}],"--tw-scale-y",["transform",Xe]]]]],{supportsNegativeValues:!0}),transform:({addDefaults:r,addUtilities:e})=>{r("transform",{"--tw-translate-x":"0","--tw-translate-y":"0","--tw-rotate":"0","--tw-skew-x":"0","--tw-skew-y":"0","--tw-scale-x":"1","--tw-scale-y":"1"}),e({".transform":{"@defaults transform":{},transform:Xe},".transform-cpu":{transform:Xe},".transform-gpu":{transform:Xe.replace("translate(var(--tw-translate-x), var(--tw-translate-y))","translate3d(var(--tw-translate-x), var(--tw-translate-y), 0)")},".transform-none":{transform:"none"}})},animation:({matchUtilities:r,theme:e,config:t})=>{let i=a=>Te(t("prefix")+a),n=Object.fromEntries(Object.entries(e("keyframes")??{}).map(([a,s])=>[a,{[`@keyframes ${i(a)}`]:s}]));r({animate:a=>{let s=$o(a);return[...s.flatMap(o=>n[o.name]),{animation:s.map(({name:o,value:l})=>o===void 0||n[o]===void 0?l:l.replace(o,i(o))).join(", ")}]}},{values:e("animation")})},cursor:L("cursor"),touchAction:({addDefaults:r,addUtilities:e})=>{r("touch-action",{"--tw-pan-x":" ","--tw-pan-y":" ","--tw-pinch-zoom":" "});let t="var(--tw-pan-x) var(--tw-pan-y) var(--tw-pinch-zoom)";e({".touch-auto":{"touch-action":"auto"},".touch-none":{"touch-action":"none"},".touch-pan-x":{"@defaults touch-action":{},"--tw-pan-x":"pan-x","touch-action":t},".touch-pan-left":{"@defaults touch-action":{},"--tw-pan-x":"pan-left","touch-action":t},".touch-pan-right":{"@defaults touch-action":{},"--tw-pan-x":"pan-right","touch-action":t},".touch-pan-y":{"@defaults touch-action":{},"--tw-pan-y":"pan-y","touch-action":t},".touch-pan-up":{"@defaults touch-action":{},"--tw-pan-y":"pan-up","touch-action":t},".touch-pan-down":{"@defaults touch-action":{},"--tw-pan-y":"pan-down","touch-action":t},".touch-pinch-zoom":{"@defaults touch-action":{},"--tw-pinch-zoom":"pinch-zoom","touch-action":t},".touch-manipulation":{"touch-action":"manipulation"}})},userSelect:({addUtilities:r})=>{r({".select-none":{"user-select":"none"},".select-text":{"user-select":"text"},".select-all":{"user-select":"all"},".select-auto":{"user-select":"auto"}})},resize:({addUtilities:r})=>{r({".resize-none":{resize:"none"},".resize-y":{resize:"vertical"},".resize-x":{resize:"horizontal"},".resize":{resize:"both"}})},scrollSnapType:({addDefaults:r,addUtilities:e})=>{r("scroll-snap-type",{"--tw-scroll-snap-strictness":"proximity"}),e({".snap-none":{"scroll-snap-type":"none"},".snap-x":{"@defaults scroll-snap-type":{},"scroll-snap-type":"x var(--tw-scroll-snap-strictness)"},".snap-y":{"@defaults scroll-snap-type":{},"scroll-snap-type":"y var(--tw-scroll-snap-strictness)"},".snap-both":{"@defaults scroll-snap-type":{},"scroll-snap-type":"both var(--tw-scroll-snap-strictness)"},".snap-mandatory":{"--tw-scroll-snap-strictness":"mandatory"},".snap-proximity":{"--tw-scroll-snap-strictness":"proximity"}})},scrollSnapAlign:({addUtilities:r})=>{r({".snap-start":{"scroll-snap-align":"start"},".snap-end":{"scroll-snap-align":"end"},".snap-center":{"scroll-snap-align":"center"},".snap-align-none":{"scroll-snap-align":"none"}})},scrollSnapStop:({addUtilities:r})=>{r({".snap-normal":{"scroll-snap-stop":"normal"},".snap-always":{"scroll-snap-stop":"always"}})},scrollMargin:L("scrollMargin",[["scroll-m",["scroll-margin"]],[["scroll-mx",["scroll-margin-left","scroll-margin-right"]],["scroll-my",["scroll-margin-top","scroll-margin-bottom"]]],[["scroll-ms",["scroll-margin-inline-start"]],["scroll-me",["scroll-margin-inline-end"]],["scroll-mt",["scroll-margin-top"]],["scroll-mr",["scroll-margin-right"]],["scroll-mb",["scroll-margin-bottom"]],["scroll-ml",["scroll-margin-left"]]]],{supportsNegativeValues:!0}),scrollPadding:L("scrollPadding",[["scroll-p",["scroll-padding"]],[["scroll-px",["scroll-padding-left","scroll-padding-right"]],["scroll-py",["scroll-padding-top","scroll-padding-bottom"]]],[["scroll-ps",["scroll-padding-inline-start"]],["scroll-pe",["scroll-padding-inline-end"]],["scroll-pt",["scroll-padding-top"]],["scroll-pr",["scroll-padding-right"]],["scroll-pb",["scroll-padding-bottom"]],["scroll-pl",["scroll-padding-left"]]]]),listStylePosition:({addUtilities:r})=>{r({".list-inside":{"list-style-position":"inside"},".list-outside":{"list-style-position":"outside"}})},listStyleType:L("listStyleType",[["list",["listStyleType"]]]),listStyleImage:L("listStyleImage",[["list-image",["listStyleImage"]]]),appearance:({addUtilities:r})=>{r({".appearance-none":{appearance:"none"},".appearance-auto":{appearance:"auto"}})},columns:L("columns",[["columns",["columns"]]]),breakBefore:({addUtilities:r})=>{r({".break-before-auto":{"break-before":"auto"},".break-before-avoid":{"break-before":"avoid"},".break-before-all":{"break-before":"all"},".break-before-avoid-page":{"break-before":"avoid-page"},".break-before-page":{"break-before":"page"},".break-before-left":{"break-before":"left"},".break-before-right":{"break-before":"right"},".break-before-column":{"break-before":"column"}})},breakInside:({addUtilities:r})=>{r({".break-inside-auto":{"break-inside":"auto"},".break-inside-avoid":{"break-inside":"avoid"},".break-inside-avoid-page":{"break-inside":"avoid-page"},".break-inside-avoid-column":{"break-inside":"avoid-column"}})},breakAfter:({addUtilities:r})=>{r({".break-after-auto":{"break-after":"auto"},".break-after-avoid":{"break-after":"avoid"},".break-after-all":{"break-after":"all"},".break-after-avoid-page":{"break-after":"avoid-page"},".break-after-page":{"break-after":"page"},".break-after-left":{"break-after":"left"},".break-after-right":{"break-after":"right"},".break-after-column":{"break-after":"column"}})},gridAutoColumns:L("gridAutoColumns",[["auto-cols",["gridAutoColumns"]]]),gridAutoFlow:({addUtilities:r})=>{r({".grid-flow-row":{gridAutoFlow:"row"},".grid-flow-col":{gridAutoFlow:"column"},".grid-flow-dense":{gridAutoFlow:"dense"},".grid-flow-row-dense":{gridAutoFlow:"row dense"},".grid-flow-col-dense":{gridAutoFlow:"column dense"}})},gridAutoRows:L("gridAutoRows",[["auto-rows",["gridAutoRows"]]]),gridTemplateColumns:L("gridTemplateColumns",[["grid-cols",["gridTemplateColumns"]]]),gridTemplateRows:L("gridTemplateRows",[["grid-rows",["gridTemplateRows"]]]),flexDirection:({addUtilities:r})=>{r({".flex-row":{"flex-direction":"row"},".flex-row-reverse":{"flex-direction":"row-reverse"},".flex-col":{"flex-direction":"column"},".flex-col-reverse":{"flex-direction":"column-reverse"}})},flexWrap:({addUtilities:r})=>{r({".flex-wrap":{"flex-wrap":"wrap"},".flex-wrap-reverse":{"flex-wrap":"wrap-reverse"},".flex-nowrap":{"flex-wrap":"nowrap"}})},placeContent:({addUtilities:r})=>{r({".place-content-center":{"place-content":"center"},".place-content-start":{"place-content":"start"},".place-content-end":{"place-content":"end"},".place-content-between":{"place-content":"space-between"},".place-content-around":{"place-content":"space-around"},".place-content-evenly":{"place-content":"space-evenly"},".place-content-baseline":{"place-content":"baseline"},".place-content-stretch":{"place-content":"stretch"}})},placeItems:({addUtilities:r})=>{r({".place-items-start":{"place-items":"start"},".place-items-end":{"place-items":"end"},".place-items-center":{"place-items":"center"},".place-items-baseline":{"place-items":"baseline"},".place-items-stretch":{"place-items":"stretch"}})},alignContent:({addUtilities:r})=>{r({".content-normal":{"align-content":"normal"},".content-center":{"align-content":"center"},".content-start":{"align-content":"flex-start"},".content-end":{"align-content":"flex-end"},".content-between":{"align-content":"space-between"},".content-around":{"align-content":"space-around"},".content-evenly":{"align-content":"space-evenly"},".content-baseline":{"align-content":"baseline"},".content-stretch":{"align-content":"stretch"}})},alignItems:({addUtilities:r})=>{r({".items-start":{"align-items":"flex-start"},".items-end":{"align-items":"flex-end"},".items-center":{"align-items":"center"},".items-baseline":{"align-items":"baseline"},".items-stretch":{"align-items":"stretch"}})},justifyContent:({addUtilities:r})=>{r({".justify-normal":{"justify-content":"normal"},".justify-start":{"justify-content":"flex-start"},".justify-end":{"justify-content":"flex-end"},".justify-center":{"justify-content":"center"},".justify-between":{"justify-content":"space-between"},".justify-around":{"justify-content":"space-around"},".justify-evenly":{"justify-content":"space-evenly"},".justify-stretch":{"justify-content":"stretch"}})},justifyItems:({addUtilities:r})=>{r({".justify-items-start":{"justify-items":"start"},".justify-items-end":{"justify-items":"end"},".justify-items-center":{"justify-items":"center"},".justify-items-stretch":{"justify-items":"stretch"}})},gap:L("gap",[["gap",["gap"]],[["gap-x",["columnGap"]],["gap-y",["rowGap"]]]]),space:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"space-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"0","margin-right":`calc(${i} * var(--tw-space-x-reverse))`,"margin-left":`calc(${i} * calc(1 - var(--tw-space-x-reverse)))`}}),"space-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"0","margin-top":`calc(${i} * calc(1 - var(--tw-space-y-reverse)))`,"margin-bottom":`calc(${i} * var(--tw-space-y-reverse))`}})},{values:t("space"),supportsNegativeValues:!0}),e({".space-y-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-y-reverse":"1"},".space-x-reverse > :not([hidden]) ~ :not([hidden])":{"--tw-space-x-reverse":"1"}})},divideWidth:({matchUtilities:r,addUtilities:e,theme:t})=>{r({"divide-x":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"0","border-right-width":`calc(${i} * var(--tw-divide-x-reverse))`,"border-left-width":`calc(${i} * calc(1 - var(--tw-divide-x-reverse)))`}}),"divide-y":i=>(i=i==="0"?"0px":i,{"& > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"0","border-top-width":`calc(${i} * calc(1 - var(--tw-divide-y-reverse)))`,"border-bottom-width":`calc(${i} * var(--tw-divide-y-reverse))`}})},{values:t("divideWidth"),type:["line-width","length","any"]}),e({".divide-y-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-y-reverse":"1"},".divide-x-reverse > :not([hidden]) ~ :not([hidden])":{"@defaults border-width":{},"--tw-divide-x-reverse":"1"}})},divideStyle:({addUtilities:r})=>{r({".divide-solid > :not([hidden]) ~ :not([hidden])":{"border-style":"solid"},".divide-dashed > :not([hidden]) ~ :not([hidden])":{"border-style":"dashed"},".divide-dotted > :not([hidden]) ~ :not([hidden])":{"border-style":"dotted"},".divide-double > :not([hidden]) ~ :not([hidden])":{"border-style":"double"},".divide-none > :not([hidden]) ~ :not([hidden])":{"border-style":"none"}})},divideColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({divide:i=>t("divideOpacity")?{["& > :not([hidden]) ~ :not([hidden])"]:Ae({color:i,property:"border-color",variable:"--tw-divide-opacity"})}:{["& > :not([hidden]) ~ :not([hidden])"]:{"border-color":X(i)}}},{values:(({DEFAULT:i,...n})=>n)(xe(e("divideColor"))),type:["color","any"]})},divideOpacity:({matchUtilities:r,theme:e})=>{r({"divide-opacity":t=>({["& > :not([hidden]) ~ :not([hidden])"]:{"--tw-divide-opacity":t}})},{values:e("divideOpacity")})},placeSelf:({addUtilities:r})=>{r({".place-self-auto":{"place-self":"auto"},".place-self-start":{"place-self":"start"},".place-self-end":{"place-self":"end"},".place-self-center":{"place-self":"center"},".place-self-stretch":{"place-self":"stretch"}})},alignSelf:({addUtilities:r})=>{r({".self-auto":{"align-self":"auto"},".self-start":{"align-self":"flex-start"},".self-end":{"align-self":"flex-end"},".self-center":{"align-self":"center"},".self-stretch":{"align-self":"stretch"},".self-baseline":{"align-self":"baseline"}})},justifySelf:({addUtilities:r})=>{r({".justify-self-auto":{"justify-self":"auto"},".justify-self-start":{"justify-self":"start"},".justify-self-end":{"justify-self":"end"},".justify-self-center":{"justify-self":"center"},".justify-self-stretch":{"justify-self":"stretch"}})},overflow:({addUtilities:r})=>{r({".overflow-auto":{overflow:"auto"},".overflow-hidden":{overflow:"hidden"},".overflow-clip":{overflow:"clip"},".overflow-visible":{overflow:"visible"},".overflow-scroll":{overflow:"scroll"},".overflow-x-auto":{"overflow-x":"auto"},".overflow-y-auto":{"overflow-y":"auto"},".overflow-x-hidden":{"overflow-x":"hidden"},".overflow-y-hidden":{"overflow-y":"hidden"},".overflow-x-clip":{"overflow-x":"clip"},".overflow-y-clip":{"overflow-y":"clip"},".overflow-x-visible":{"overflow-x":"visible"},".overflow-y-visible":{"overflow-y":"visible"},".overflow-x-scroll":{"overflow-x":"scroll"},".overflow-y-scroll":{"overflow-y":"scroll"}})},overscrollBehavior:({addUtilities:r})=>{r({".overscroll-auto":{"overscroll-behavior":"auto"},".overscroll-contain":{"overscroll-behavior":"contain"},".overscroll-none":{"overscroll-behavior":"none"},".overscroll-y-auto":{"overscroll-behavior-y":"auto"},".overscroll-y-contain":{"overscroll-behavior-y":"contain"},".overscroll-y-none":{"overscroll-behavior-y":"none"},".overscroll-x-auto":{"overscroll-behavior-x":"auto"},".overscroll-x-contain":{"overscroll-behavior-x":"contain"},".overscroll-x-none":{"overscroll-behavior-x":"none"}})},scrollBehavior:({addUtilities:r})=>{r({".scroll-auto":{"scroll-behavior":"auto"},".scroll-smooth":{"scroll-behavior":"smooth"}})},textOverflow:({addUtilities:r})=>{r({".truncate":{overflow:"hidden","text-overflow":"ellipsis","white-space":"nowrap"},".overflow-ellipsis":{"text-overflow":"ellipsis"},".text-ellipsis":{"text-overflow":"ellipsis"},".text-clip":{"text-overflow":"clip"}})},hyphens:({addUtilities:r})=>{r({".hyphens-none":{hyphens:"none"},".hyphens-manual":{hyphens:"manual"},".hyphens-auto":{hyphens:"auto"}})},whitespace:({addUtilities:r})=>{r({".whitespace-normal":{"white-space":"normal"},".whitespace-nowrap":{"white-space":"nowrap"},".whitespace-pre":{"white-space":"pre"},".whitespace-pre-line":{"white-space":"pre-line"},".whitespace-pre-wrap":{"white-space":"pre-wrap"},".whitespace-break-spaces":{"white-space":"break-spaces"}})},textWrap:({addUtilities:r})=>{r({".text-wrap":{"text-wrap":"wrap"},".text-nowrap":{"text-wrap":"nowrap"},".text-balance":{"text-wrap":"balance"},".text-pretty":{"text-wrap":"pretty"}})},wordBreak:({addUtilities:r})=>{r({".break-normal":{"overflow-wrap":"normal","word-break":"normal"},".break-words":{"overflow-wrap":"break-word"},".break-all":{"word-break":"break-all"},".break-keep":{"word-break":"keep-all"}})},borderRadius:L("borderRadius",[["rounded",["border-radius"]],[["rounded-s",["border-start-start-radius","border-end-start-radius"]],["rounded-e",["border-start-end-radius","border-end-end-radius"]],["rounded-t",["border-top-left-radius","border-top-right-radius"]],["rounded-r",["border-top-right-radius","border-bottom-right-radius"]],["rounded-b",["border-bottom-right-radius","border-bottom-left-radius"]],["rounded-l",["border-top-left-radius","border-bottom-left-radius"]]],[["rounded-ss",["border-start-start-radius"]],["rounded-se",["border-start-end-radius"]],["rounded-ee",["border-end-end-radius"]],["rounded-es",["border-end-start-radius"]],["rounded-tl",["border-top-left-radius"]],["rounded-tr",["border-top-right-radius"]],["rounded-br",["border-bottom-right-radius"]],["rounded-bl",["border-bottom-left-radius"]]]]),borderWidth:L("borderWidth",[["border",[["@defaults border-width",{}],"border-width"]],[["border-x",[["@defaults border-width",{}],"border-left-width","border-right-width"]],["border-y",[["@defaults border-width",{}],"border-top-width","border-bottom-width"]]],[["border-s",[["@defaults border-width",{}],"border-inline-start-width"]],["border-e",[["@defaults border-width",{}],"border-inline-end-width"]],["border-t",[["@defaults border-width",{}],"border-top-width"]],["border-r",[["@defaults border-width",{}],"border-right-width"]],["border-b",[["@defaults border-width",{}],"border-bottom-width"]],["border-l",[["@defaults border-width",{}],"border-left-width"]]]],{type:["line-width","length"]}),borderStyle:({addUtilities:r})=>{r({".border-solid":{"border-style":"solid"},".border-dashed":{"border-style":"dashed"},".border-dotted":{"border-style":"dotted"},".border-double":{"border-style":"double"},".border-hidden":{"border-style":"hidden"},".border-none":{"border-style":"none"}})},borderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({border:i=>t("borderOpacity")?Ae({color:i,property:"border-color",variable:"--tw-border-opacity"}):{"border-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-x":i=>t("borderOpacity")?Ae({color:i,property:["border-left-color","border-right-color"],variable:"--tw-border-opacity"}):{"border-left-color":X(i),"border-right-color":X(i)},"border-y":i=>t("borderOpacity")?Ae({color:i,property:["border-top-color","border-bottom-color"],variable:"--tw-border-opacity"}):{"border-top-color":X(i),"border-bottom-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]}),r({"border-s":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-start-color",variable:"--tw-border-opacity"}):{"border-inline-start-color":X(i)},"border-e":i=>t("borderOpacity")?Ae({color:i,property:"border-inline-end-color",variable:"--tw-border-opacity"}):{"border-inline-end-color":X(i)},"border-t":i=>t("borderOpacity")?Ae({color:i,property:"border-top-color",variable:"--tw-border-opacity"}):{"border-top-color":X(i)},"border-r":i=>t("borderOpacity")?Ae({color:i,property:"border-right-color",variable:"--tw-border-opacity"}):{"border-right-color":X(i)},"border-b":i=>t("borderOpacity")?Ae({color:i,property:"border-bottom-color",variable:"--tw-border-opacity"}):{"border-bottom-color":X(i)},"border-l":i=>t("borderOpacity")?Ae({color:i,property:"border-left-color",variable:"--tw-border-opacity"}):{"border-left-color":X(i)}},{values:(({DEFAULT:i,...n})=>n)(xe(e("borderColor"))),type:["color","any"]})},borderOpacity:L("borderOpacity",[["border-opacity",["--tw-border-opacity"]]]),backgroundColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({bg:i=>t("backgroundOpacity")?Ae({color:i,property:"background-color",variable:"--tw-bg-opacity"}):{"background-color":X(i)}},{values:xe(e("backgroundColor")),type:["color","any"]})},backgroundOpacity:L("backgroundOpacity",[["bg-opacity",["--tw-bg-opacity"]]]),backgroundImage:L("backgroundImage",[["bg",["background-image"]]],{type:["lookup","image","url"]}),gradientColorStops:(()=>{function r(e){return Ze(e,0,"rgb(255 255 255 / 0)")}return function({matchUtilities:e,theme:t,addDefaults:i}){i("gradient-color-stops",{"--tw-gradient-from-position":" ","--tw-gradient-via-position":" ","--tw-gradient-to-position":" "});let n={values:xe(t("gradientColorStops")),type:["color","any"]},a={values:t("gradientColorStopPositions"),type:["length","percentage"]};e({from:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-from":`${X(s)} var(--tw-gradient-from-position)`,"--tw-gradient-to":`${o} var(--tw-gradient-to-position)`,"--tw-gradient-stops":"var(--tw-gradient-from), var(--tw-gradient-to)"}}},n),e({from:s=>({"--tw-gradient-from-position":s})},a),e({via:s=>{let o=r(s);return{"@defaults gradient-color-stops":{},"--tw-gradient-to":`${o} var(--tw-gradient-to-position)`,"--tw-gradient-stops":`var(--tw-gradient-from), ${X(s)} var(--tw-gradient-via-position), var(--tw-gradient-to)`}}},n),e({via:s=>({"--tw-gradient-via-position":s})},a),e({to:s=>({"@defaults gradient-color-stops":{},"--tw-gradient-to":`${X(s)} var(--tw-gradient-to-position)`})},n),e({to:s=>({"--tw-gradient-to-position":s})},a)}})(),boxDecorationBreak:({addUtilities:r})=>{r({".decoration-slice":{"box-decoration-break":"slice"},".decoration-clone":{"box-decoration-break":"clone"},".box-decoration-slice":{"box-decoration-break":"slice"},".box-decoration-clone":{"box-decoration-break":"clone"}})},backgroundSize:L("backgroundSize",[["bg",["background-size"]]],{type:["lookup","length","percentage","size"]}),backgroundAttachment:({addUtilities:r})=>{r({".bg-fixed":{"background-attachment":"fixed"},".bg-local":{"background-attachment":"local"},".bg-scroll":{"background-attachment":"scroll"}})},backgroundClip:({addUtilities:r})=>{r({".bg-clip-border":{"background-clip":"border-box"},".bg-clip-padding":{"background-clip":"padding-box"},".bg-clip-content":{"background-clip":"content-box"},".bg-clip-text":{"background-clip":"text"}})},backgroundPosition:L("backgroundPosition",[["bg",["background-position"]]],{type:["lookup",["position",{preferOnConflict:!0}]]}),backgroundRepeat:({addUtilities:r})=>{r({".bg-repeat":{"background-repeat":"repeat"},".bg-no-repeat":{"background-repeat":"no-repeat"},".bg-repeat-x":{"background-repeat":"repeat-x"},".bg-repeat-y":{"background-repeat":"repeat-y"},".bg-repeat-round":{"background-repeat":"round"},".bg-repeat-space":{"background-repeat":"space"}})},backgroundOrigin:({addUtilities:r})=>{r({".bg-origin-border":{"background-origin":"border-box"},".bg-origin-padding":{"background-origin":"padding-box"},".bg-origin-content":{"background-origin":"content-box"}})},fill:({matchUtilities:r,theme:e})=>{r({fill:t=>({fill:X(t)})},{values:xe(e("fill")),type:["color","any"]})},stroke:({matchUtilities:r,theme:e})=>{r({stroke:t=>({stroke:X(t)})},{values:xe(e("stroke")),type:["color","url","any"]})},strokeWidth:L("strokeWidth",[["stroke",["stroke-width"]]],{type:["length","number","percentage"]}),objectFit:({addUtilities:r})=>{r({".object-contain":{"object-fit":"contain"},".object-cover":{"object-fit":"cover"},".object-fill":{"object-fit":"fill"},".object-none":{"object-fit":"none"},".object-scale-down":{"object-fit":"scale-down"}})},objectPosition:L("objectPosition",[["object",["object-position"]]]),padding:L("padding",[["p",["padding"]],[["px",["padding-left","padding-right"]],["py",["padding-top","padding-bottom"]]],[["ps",["padding-inline-start"]],["pe",["padding-inline-end"]],["pt",["padding-top"]],["pr",["padding-right"]],["pb",["padding-bottom"]],["pl",["padding-left"]]]]),textAlign:({addUtilities:r})=>{r({".text-left":{"text-align":"left"},".text-center":{"text-align":"center"},".text-right":{"text-align":"right"},".text-justify":{"text-align":"justify"},".text-start":{"text-align":"start"},".text-end":{"text-align":"end"}})},textIndent:L("textIndent",[["indent",["text-indent"]]],{supportsNegativeValues:!0}),verticalAlign:({addUtilities:r,matchUtilities:e})=>{r({".align-baseline":{"vertical-align":"baseline"},".align-top":{"vertical-align":"top"},".align-middle":{"vertical-align":"middle"},".align-bottom":{"vertical-align":"bottom"},".align-text-top":{"vertical-align":"text-top"},".align-text-bottom":{"vertical-align":"text-bottom"},".align-sub":{"vertical-align":"sub"},".align-super":{"vertical-align":"super"}}),e({align:t=>({"vertical-align":t})})},fontFamily:({matchUtilities:r,theme:e})=>{r({font:t=>{let[i,n={}]=Array.isArray(t)&&ke(t[1])?t:[t],{fontFeatureSettings:a,fontVariationSettings:s}=n;return{"font-family":Array.isArray(i)?i.join(", "):i,...a===void 0?{}:{"font-feature-settings":a},...s===void 0?{}:{"font-variation-settings":s}}}},{values:e("fontFamily"),type:["lookup","generic-name","family-name"]})},fontSize:({matchUtilities:r,theme:e})=>{r({text:(t,{modifier:i})=>{let[n,a]=Array.isArray(t)?t:[t];if(i)return{"font-size":n,"line-height":i};let{lineHeight:s,letterSpacing:o,fontWeight:l}=ke(a)?a:{lineHeight:a};return{"font-size":n,...s===void 0?{}:{"line-height":s},...o===void 0?{}:{"letter-spacing":o},...l===void 0?{}:{"font-weight":l}}}},{values:e("fontSize"),modifiers:e("lineHeight"),type:["absolute-size","relative-size","length","percentage"]})},fontWeight:L("fontWeight",[["font",["fontWeight"]]],{type:["lookup","number","any"]}),textTransform:({addUtilities:r})=>{r({".uppercase":{"text-transform":"uppercase"},".lowercase":{"text-transform":"lowercase"},".capitalize":{"text-transform":"capitalize"},".normal-case":{"text-transform":"none"}})},fontStyle:({addUtilities:r})=>{r({".italic":{"font-style":"italic"},".not-italic":{"font-style":"normal"}})},fontVariantNumeric:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-ordinal) var(--tw-slashed-zero) var(--tw-numeric-figure) var(--tw-numeric-spacing) var(--tw-numeric-fraction)";r("font-variant-numeric",{"--tw-ordinal":" ","--tw-slashed-zero":" ","--tw-numeric-figure":" ","--tw-numeric-spacing":" ","--tw-numeric-fraction":" "}),e({".normal-nums":{"font-variant-numeric":"normal"},".ordinal":{"@defaults font-variant-numeric":{},"--tw-ordinal":"ordinal","font-variant-numeric":t},".slashed-zero":{"@defaults font-variant-numeric":{},"--tw-slashed-zero":"slashed-zero","font-variant-numeric":t},".lining-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"lining-nums","font-variant-numeric":t},".oldstyle-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-figure":"oldstyle-nums","font-variant-numeric":t},".proportional-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"proportional-nums","font-variant-numeric":t},".tabular-nums":{"@defaults font-variant-numeric":{},"--tw-numeric-spacing":"tabular-nums","font-variant-numeric":t},".diagonal-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"diagonal-fractions","font-variant-numeric":t},".stacked-fractions":{"@defaults font-variant-numeric":{},"--tw-numeric-fraction":"stacked-fractions","font-variant-numeric":t}})},lineHeight:L("lineHeight",[["leading",["lineHeight"]]]),letterSpacing:L("letterSpacing",[["tracking",["letterSpacing"]]],{supportsNegativeValues:!0}),textColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({text:i=>t("textOpacity")?Ae({color:i,property:"color",variable:"--tw-text-opacity"}):{color:X(i)}},{values:xe(e("textColor")),type:["color","any"]})},textOpacity:L("textOpacity",[["text-opacity",["--tw-text-opacity"]]]),textDecoration:({addUtilities:r})=>{r({".underline":{"text-decoration-line":"underline"},".overline":{"text-decoration-line":"overline"},".line-through":{"text-decoration-line":"line-through"},".no-underline":{"text-decoration-line":"none"}})},textDecorationColor:({matchUtilities:r,theme:e})=>{r({decoration:t=>({"text-decoration-color":X(t)})},{values:xe(e("textDecorationColor")),type:["color","any"]})},textDecorationStyle:({addUtilities:r})=>{r({".decoration-solid":{"text-decoration-style":"solid"},".decoration-double":{"text-decoration-style":"double"},".decoration-dotted":{"text-decoration-style":"dotted"},".decoration-dashed":{"text-decoration-style":"dashed"},".decoration-wavy":{"text-decoration-style":"wavy"}})},textDecorationThickness:L("textDecorationThickness",[["decoration",["text-decoration-thickness"]]],{type:["length","percentage"]}),textUnderlineOffset:L("textUnderlineOffset",[["underline-offset",["text-underline-offset"]]],{type:["length","percentage","any"]}),fontSmoothing:({addUtilities:r})=>{r({".antialiased":{"-webkit-font-smoothing":"antialiased","-moz-osx-font-smoothing":"grayscale"},".subpixel-antialiased":{"-webkit-font-smoothing":"auto","-moz-osx-font-smoothing":"auto"}})},placeholderColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({placeholder:i=>t("placeholderOpacity")?{"&::placeholder":Ae({color:i,property:"color",variable:"--tw-placeholder-opacity"})}:{"&::placeholder":{color:X(i)}}},{values:xe(e("placeholderColor")),type:["color","any"]})},placeholderOpacity:({matchUtilities:r,theme:e})=>{r({"placeholder-opacity":t=>({["&::placeholder"]:{"--tw-placeholder-opacity":t}})},{values:e("placeholderOpacity")})},caretColor:({matchUtilities:r,theme:e})=>{r({caret:t=>({"caret-color":X(t)})},{values:xe(e("caretColor")),type:["color","any"]})},accentColor:({matchUtilities:r,theme:e})=>{r({accent:t=>({"accent-color":X(t)})},{values:xe(e("accentColor")),type:["color","any"]})},opacity:L("opacity",[["opacity",["opacity"]]]),backgroundBlendMode:({addUtilities:r})=>{r({".bg-blend-normal":{"background-blend-mode":"normal"},".bg-blend-multiply":{"background-blend-mode":"multiply"},".bg-blend-screen":{"background-blend-mode":"screen"},".bg-blend-overlay":{"background-blend-mode":"overlay"},".bg-blend-darken":{"background-blend-mode":"darken"},".bg-blend-lighten":{"background-blend-mode":"lighten"},".bg-blend-color-dodge":{"background-blend-mode":"color-dodge"},".bg-blend-color-burn":{"background-blend-mode":"color-burn"},".bg-blend-hard-light":{"background-blend-mode":"hard-light"},".bg-blend-soft-light":{"background-blend-mode":"soft-light"},".bg-blend-difference":{"background-blend-mode":"difference"},".bg-blend-exclusion":{"background-blend-mode":"exclusion"},".bg-blend-hue":{"background-blend-mode":"hue"},".bg-blend-saturation":{"background-blend-mode":"saturation"},".bg-blend-color":{"background-blend-mode":"color"},".bg-blend-luminosity":{"background-blend-mode":"luminosity"}})},mixBlendMode:({addUtilities:r})=>{r({".mix-blend-normal":{"mix-blend-mode":"normal"},".mix-blend-multiply":{"mix-blend-mode":"multiply"},".mix-blend-screen":{"mix-blend-mode":"screen"},".mix-blend-overlay":{"mix-blend-mode":"overlay"},".mix-blend-darken":{"mix-blend-mode":"darken"},".mix-blend-lighten":{"mix-blend-mode":"lighten"},".mix-blend-color-dodge":{"mix-blend-mode":"color-dodge"},".mix-blend-color-burn":{"mix-blend-mode":"color-burn"},".mix-blend-hard-light":{"mix-blend-mode":"hard-light"},".mix-blend-soft-light":{"mix-blend-mode":"soft-light"},".mix-blend-difference":{"mix-blend-mode":"difference"},".mix-blend-exclusion":{"mix-blend-mode":"exclusion"},".mix-blend-hue":{"mix-blend-mode":"hue"},".mix-blend-saturation":{"mix-blend-mode":"saturation"},".mix-blend-color":{"mix-blend-mode":"color"},".mix-blend-luminosity":{"mix-blend-mode":"luminosity"},".mix-blend-plus-darker":{"mix-blend-mode":"plus-darker"},".mix-blend-plus-lighter":{"mix-blend-mode":"plus-lighter"}})},boxShadow:(()=>{let r=mt("boxShadow"),e=["var(--tw-ring-offset-shadow, 0 0 #0000)","var(--tw-ring-shadow, 0 0 #0000)","var(--tw-shadow)"].join(", ");return function({matchUtilities:t,addDefaults:i,theme:n}){i("box-shadow",{"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),t({shadow:a=>{a=r(a);let s=Ji(a);for(let o of s)!o.valid||(o.color="var(--tw-shadow-color)");return{"@defaults box-shadow":{},"--tw-shadow":a==="none"?"0 0 #0000":a,"--tw-shadow-colored":a==="none"?"0 0 #0000":qf(s),"box-shadow":e}}},{values:n("boxShadow"),type:["shadow"]})}})(),boxShadowColor:({matchUtilities:r,theme:e})=>{r({shadow:t=>({"--tw-shadow-color":X(t),"--tw-shadow":"var(--tw-shadow-colored)"})},{values:xe(e("boxShadowColor")),type:["color","any"]})},outlineStyle:({addUtilities:r})=>{r({".outline-none":{outline:"2px solid transparent","outline-offset":"2px"},".outline":{"outline-style":"solid"},".outline-dashed":{"outline-style":"dashed"},".outline-dotted":{"outline-style":"dotted"},".outline-double":{"outline-style":"double"}})},outlineWidth:L("outlineWidth",[["outline",["outline-width"]]],{type:["length","number","percentage"]}),outlineOffset:L("outlineOffset",[["outline-offset",["outline-offset"]]],{type:["length","number","percentage","any"],supportsNegativeValues:!0}),outlineColor:({matchUtilities:r,theme:e})=>{r({outline:t=>({"outline-color":X(t)})},{values:xe(e("outlineColor")),type:["color","any"]})},ringWidth:({matchUtilities:r,addDefaults:e,addUtilities:t,theme:i,config:n})=>{let a=(()=>{if(we(n(),"respectDefaultRingColorOpacity"))return i("ringColor.DEFAULT");let s=i("ringOpacity.DEFAULT","0.5");return i("ringColor")?.DEFAULT?Ze(i("ringColor")?.DEFAULT,s,`rgb(147 197 253 / ${s})`):`rgb(147 197 253 / ${s})`})();e("ring-width",{"--tw-ring-inset":" ","--tw-ring-offset-width":i("ringOffsetWidth.DEFAULT","0px"),"--tw-ring-offset-color":i("ringOffsetColor.DEFAULT","#fff"),"--tw-ring-color":a,"--tw-ring-offset-shadow":"0 0 #0000","--tw-ring-shadow":"0 0 #0000","--tw-shadow":"0 0 #0000","--tw-shadow-colored":"0 0 #0000"}),r({ring:s=>({"@defaults ring-width":{},"--tw-ring-offset-shadow":"var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color)","--tw-ring-shadow":`var(--tw-ring-inset) 0 0 0 calc(${s} + var(--tw-ring-offset-width)) var(--tw-ring-color)`,"box-shadow":["var(--tw-ring-offset-shadow)","var(--tw-ring-shadow)","var(--tw-shadow, 0 0 #0000)"].join(", ")})},{values:i("ringWidth"),type:"length"}),t({".ring-inset":{"@defaults ring-width":{},"--tw-ring-inset":"inset"}})},ringColor:({matchUtilities:r,theme:e,corePlugins:t})=>{r({ring:i=>t("ringOpacity")?Ae({color:i,property:"--tw-ring-color",variable:"--tw-ring-opacity"}):{"--tw-ring-color":X(i)}},{values:Object.fromEntries(Object.entries(xe(e("ringColor"))).filter(([i])=>i!=="DEFAULT")),type:["color","any"]})},ringOpacity:r=>{let{config:e}=r;return L("ringOpacity",[["ring-opacity",["--tw-ring-opacity"]]],{filterDefault:!we(e(),"respectDefaultRingColorOpacity")})(r)},ringOffsetWidth:L("ringOffsetWidth",[["ring-offset",["--tw-ring-offset-width"]]],{type:"length"}),ringOffsetColor:({matchUtilities:r,theme:e})=>{r({"ring-offset":t=>({"--tw-ring-offset-color":X(t)})},{values:xe(e("ringOffsetColor")),type:["color","any"]})},blur:({matchUtilities:r,theme:e})=>{r({blur:t=>({"--tw-blur":t.trim()===""?" ":`blur(${t})`,"@defaults filter":{},filter:nt})},{values:e("blur")})},brightness:({matchUtilities:r,theme:e})=>{r({brightness:t=>({"--tw-brightness":`brightness(${t})`,"@defaults filter":{},filter:nt})},{values:e("brightness")})},contrast:({matchUtilities:r,theme:e})=>{r({contrast:t=>({"--tw-contrast":`contrast(${t})`,"@defaults filter":{},filter:nt})},{values:e("contrast")})},dropShadow:({matchUtilities:r,theme:e})=>{r({"drop-shadow":t=>({"--tw-drop-shadow":Array.isArray(t)?t.map(i=>`drop-shadow(${i})`).join(" "):`drop-shadow(${t})`,"@defaults filter":{},filter:nt})},{values:e("dropShadow")})},grayscale:({matchUtilities:r,theme:e})=>{r({grayscale:t=>({"--tw-grayscale":`grayscale(${t})`,"@defaults filter":{},filter:nt})},{values:e("grayscale")})},hueRotate:({matchUtilities:r,theme:e})=>{r({"hue-rotate":t=>({"--tw-hue-rotate":`hue-rotate(${t})`,"@defaults filter":{},filter:nt})},{values:e("hueRotate"),supportsNegativeValues:!0})},invert:({matchUtilities:r,theme:e})=>{r({invert:t=>({"--tw-invert":`invert(${t})`,"@defaults filter":{},filter:nt})},{values:e("invert")})},saturate:({matchUtilities:r,theme:e})=>{r({saturate:t=>({"--tw-saturate":`saturate(${t})`,"@defaults filter":{},filter:nt})},{values:e("saturate")})},sepia:({matchUtilities:r,theme:e})=>{r({sepia:t=>({"--tw-sepia":`sepia(${t})`,"@defaults filter":{},filter:nt})},{values:e("sepia")})},filter:({addDefaults:r,addUtilities:e})=>{r("filter",{"--tw-blur":" ","--tw-brightness":" ","--tw-contrast":" ","--tw-grayscale":" ","--tw-hue-rotate":" ","--tw-invert":" ","--tw-saturate":" ","--tw-sepia":" ","--tw-drop-shadow":" "}),e({".filter":{"@defaults filter":{},filter:nt},".filter-none":{filter:"none"}})},backdropBlur:({matchUtilities:r,theme:e})=>{r({"backdrop-blur":t=>({"--tw-backdrop-blur":t.trim()===""?" ":`blur(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBlur")})},backdropBrightness:({matchUtilities:r,theme:e})=>{r({"backdrop-brightness":t=>({"--tw-backdrop-brightness":`brightness(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropBrightness")})},backdropContrast:({matchUtilities:r,theme:e})=>{r({"backdrop-contrast":t=>({"--tw-backdrop-contrast":`contrast(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropContrast")})},backdropGrayscale:({matchUtilities:r,theme:e})=>{r({"backdrop-grayscale":t=>({"--tw-backdrop-grayscale":`grayscale(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropGrayscale")})},backdropHueRotate:({matchUtilities:r,theme:e})=>{r({"backdrop-hue-rotate":t=>({"--tw-backdrop-hue-rotate":`hue-rotate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropHueRotate"),supportsNegativeValues:!0})},backdropInvert:({matchUtilities:r,theme:e})=>{r({"backdrop-invert":t=>({"--tw-backdrop-invert":`invert(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropInvert")})},backdropOpacity:({matchUtilities:r,theme:e})=>{r({"backdrop-opacity":t=>({"--tw-backdrop-opacity":`opacity(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropOpacity")})},backdropSaturate:({matchUtilities:r,theme:e})=>{r({"backdrop-saturate":t=>({"--tw-backdrop-saturate":`saturate(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSaturate")})},backdropSepia:({matchUtilities:r,theme:e})=>{r({"backdrop-sepia":t=>({"--tw-backdrop-sepia":`sepia(${t})`,"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge})},{values:e("backdropSepia")})},backdropFilter:({addDefaults:r,addUtilities:e})=>{r("backdrop-filter",{"--tw-backdrop-blur":" ","--tw-backdrop-brightness":" ","--tw-backdrop-contrast":" ","--tw-backdrop-grayscale":" ","--tw-backdrop-hue-rotate":" ","--tw-backdrop-invert":" ","--tw-backdrop-opacity":" ","--tw-backdrop-saturate":" ","--tw-backdrop-sepia":" "}),e({".backdrop-filter":{"@defaults backdrop-filter":{},"-webkit-backdrop-filter":ge,"backdrop-filter":ge},".backdrop-filter-none":{"-webkit-backdrop-filter":"none","backdrop-filter":"none"}})},transitionProperty:({matchUtilities:r,theme:e})=>{let t=e("transitionTimingFunction.DEFAULT"),i=e("transitionDuration.DEFAULT");r({transition:n=>({"transition-property":n,...n==="none"?{}:{"transition-timing-function":t,"transition-duration":i}})},{values:e("transitionProperty")})},transitionDelay:L("transitionDelay",[["delay",["transitionDelay"]]]),transitionDuration:L("transitionDuration",[["duration",["transitionDuration"]]],{filterDefault:!0}),transitionTimingFunction:L("transitionTimingFunction",[["ease",["transitionTimingFunction"]]],{filterDefault:!0}),willChange:L("willChange",[["will-change",["will-change"]]]),contain:({addDefaults:r,addUtilities:e})=>{let t="var(--tw-contain-size) var(--tw-contain-layout) var(--tw-contain-paint) var(--tw-contain-style)";r("contain",{"--tw-contain-size":" ","--tw-contain-layout":" ","--tw-contain-paint":" ","--tw-contain-style":" "}),e({".contain-none":{contain:"none"},".contain-content":{contain:"content"},".contain-strict":{contain:"strict"},".contain-size":{"@defaults contain":{},"--tw-contain-size":"size",contain:t},".contain-inline-size":{"@defaults contain":{},"--tw-contain-size":"inline-size",contain:t},".contain-layout":{"@defaults contain":{},"--tw-contain-layout":"layout",contain:t},".contain-paint":{"@defaults contain":{},"--tw-contain-paint":"paint",contain:t},".contain-style":{"@defaults contain":{},"--tw-contain-style":"style",contain:t}})},content:L("content",[["content",["--tw-content",["content","var(--tw-content)"]]]]),forcedColorAdjust:({addUtilities:r})=>{r({".forced-color-adjust-auto":{"forced-color-adjust":"auto"},".forced-color-adjust-none":{"forced-color-adjust":"none"}})}}});function p_(r){if(r===void 0)return!1;if(r==="true"||r==="1")return!0;if(r==="false"||r==="0")return!1;if(r==="*")return!0;let e=r.split(",").map(t=>t.split(":")[0]);return e.includes("-tailwindcss")?!1:!!e.includes("tailwindcss")}var Je,yh,bh,Zn,Lo,gt,Ei,It=R(()=>{u();Je=typeof m!="undefined"?{NODE_ENV:"production",DEBUG:p_(m.env.DEBUG)}:{NODE_ENV:"production",DEBUG:!1},yh=new Map,bh=new Map,Zn=new Map,Lo=new Map,gt=new String("*"),Ei=Symbol("__NONE__")});function cr(r){let e=[],t=!1;for(let i=0;i0)}var wh,vh,d_,Mo=R(()=>{u();wh=new Map([["{","}"],["[","]"],["(",")"]]),vh=new Map(Array.from(wh.entries()).map(([r,e])=>[e,r])),d_=new Set(['"',"'","`"])});function pr(r){let[e]=xh(r);return e.forEach(([t,i])=>t.removeChild(i)),r.nodes.push(...e.map(([,t])=>t)),r}function xh(r){let e=[],t=null;for(let i of r.nodes)if(i.type==="combinator")e=e.filter(([,n])=>Bo(n).includes("jumpable")),t=null;else if(i.type==="pseudo"){h_(i)?(t=i,e.push([r,i,null])):t&&m_(i,t)?e.push([r,i,t]):t=null;for(let n of i.nodes??[]){let[a,s]=xh(n);t=s||t,e.push(...a)}}return[e,t]}function kh(r){return r.value.startsWith("::")||No[r.value]!==void 0}function h_(r){return kh(r)&&Bo(r).includes("terminal")}function m_(r,e){return r.type!=="pseudo"||kh(r)?!1:Bo(e).includes("actionable")}function Bo(r){return No[r.value]??No.__default__}var No,es=R(()=>{u();No={"::after":["terminal","jumpable"],"::backdrop":["terminal","jumpable"],"::before":["terminal","jumpable"],"::cue":["terminal"],"::cue-region":["terminal"],"::first-letter":["terminal","jumpable"],"::first-line":["terminal","jumpable"],"::grammar-error":["terminal"],"::marker":["terminal","jumpable"],"::part":["terminal","actionable"],"::placeholder":["terminal","jumpable"],"::selection":["terminal","jumpable"],"::slotted":["terminal"],"::spelling-error":["terminal"],"::target-text":["terminal"],"::file-selector-button":["terminal","actionable"],"::deep":["actionable"],"::v-deep":["actionable"],"::ng-deep":["actionable"],":after":["terminal","jumpable"],":before":["terminal","jumpable"],":first-letter":["terminal","jumpable"],":first-line":["terminal","jumpable"],":where":[],":is":[],":has":[],__default__:["terminal","actionable"]}});function dr(r,{context:e,candidate:t}){let i=e?.tailwindConfig.prefix??"",n=r.map(s=>{let o=(0,st.default)().astSync(s.format);return{...s,ast:s.respectPrefix?ur(i,o):o}}),a=st.default.root({nodes:[st.default.selector({nodes:[st.default.className({value:Te(t)})]})]});for(let{ast:s}of n)[a,s]=y_(a,s),s.walkNesting(o=>o.replaceWith(...a.nodes[0].nodes)),a=s;return a}function Ah(r){let e=[];for(;r.prev()&&r.prev().type!=="combinator";)r=r.prev();for(;r&&r.type!=="combinator";)e.push(r),r=r.next();return e}function g_(r){return r.sort((e,t)=>e.type==="tag"&&t.type==="class"?-1:e.type==="class"&&t.type==="tag"?1:e.type==="class"&&t.type==="pseudo"&&t.value.startsWith("::")?-1:e.type==="pseudo"&&e.value.startsWith("::")&&t.type==="class"?1:r.index(e)-r.index(t)),r}function jo(r,e){let t=!1;r.walk(i=>{if(i.type==="class"&&i.value===e)return t=!0,!1}),t||r.remove()}function ts(r,e,{context:t,candidate:i,base:n}){let a=t?.tailwindConfig?.separator??":";n=n??ve(i,a).pop();let s=(0,st.default)().astSync(r);if(s.walkClasses(f=>{f.raws&&f.value.includes(n)&&(f.raws.value=Te((0,Sh.default)(f.raws.value)))}),s.each(f=>jo(f,n)),s.length===0)return null;let o=Array.isArray(e)?dr(e,{context:t,candidate:i}):e;if(o===null)return s.toString();let l=st.default.comment({value:"/*__simple__*/"}),c=st.default.comment({value:"/*__simple__*/"});return s.walkClasses(f=>{if(f.value!==n)return;let d=f.parent,p=o.nodes[0].nodes;if(d.nodes.length===1){f.replaceWith(...p);return}let h=Ah(f);d.insertBefore(h[0],l),d.insertAfter(h[h.length-1],c);for(let v of p)d.insertBefore(h[0],v.clone());f.remove(),h=Ah(l);let b=d.index(l);d.nodes.splice(b,h.length,...g_(st.default.selector({nodes:h})).nodes),l.remove(),c.remove()}),s.walkPseudos(f=>{f.value===Fo&&f.replaceWith(f.nodes)}),s.each(f=>pr(f)),s.toString()}function y_(r,e){let t=[];return r.walkPseudos(i=>{i.value===Fo&&t.push({pseudo:i,value:i.nodes[0].toString()})}),e.walkPseudos(i=>{if(i.value!==Fo)return;let n=i.nodes[0].toString(),a=t.find(c=>c.value===n);if(!a)return;let s=[],o=i.next();for(;o&&o.type!=="combinator";)s.push(o),o=o.next();let l=o;a.pseudo.parent.insertAfter(a.pseudo,st.default.selector({nodes:s.map(c=>c.clone())})),i.remove(),s.forEach(c=>c.remove()),l&&l.type==="combinator"&&l.remove()}),[r,e]}var st,Sh,Fo,zo=R(()=>{u();st=pe(it()),Sh=pe(Rn());fr();Wn();es();zt();Fo=":merge"});function rs(r,e){let t=(0,Uo.default)().astSync(r);return t.each(i=>{i.nodes.some(a=>a.type==="combinator")&&(i.nodes=[Uo.default.pseudo({value:":is",nodes:[i.clone()]})]),pr(i)}),`${e} ${t.toString()}`}var Uo,Vo=R(()=>{u();Uo=pe(it());es()});function Ho(r){return b_.transformSync(r)}function*w_(r){let e=1/0;for(;e>=0;){let t,i=!1;if(e===1/0&&r.endsWith("]")){let s=r.indexOf("[");r[s-1]==="-"?t=s-1:r[s-1]==="/"?(t=s-1,i=!0):t=-1}else e===1/0&&r.includes("/")?(t=r.lastIndexOf("/"),i=!0):t=r.lastIndexOf("-",e);if(t<0)break;let n=r.slice(0,t),a=r.slice(i?t:t+1);e=t-1,!(n===""||a==="/")&&(yield[n,a])}}function v_(r,e){if(r.length===0||e.tailwindConfig.prefix==="")return r;for(let t of r){let[i]=t;if(i.options.respectPrefix){let n=ee.root({nodes:[t[1].clone()]}),a=t[1].raws.tailwind.classCandidate;n.walkRules(s=>{let o=a.startsWith("-");s.selector=ur(e.tailwindConfig.prefix,s.selector,o)}),t[1]=n.nodes[0]}}return r}function x_(r,e){if(r.length===0)return r;let t=[];function i(n){return n.parent&&n.parent.type==="atrule"&&n.parent.name==="keyframes"}for(let[n,a]of r){let s=ee.root({nodes:[a.clone()]});s.walkRules(o=>{if(i(o))return;let l=(0,is.default)().astSync(o.selector);l.each(c=>jo(c,e)),Wf(l,c=>c===e?`!${c}`:c),o.selector=l.toString(),o.walkDecls(c=>c.important=!0)}),t.push([{...n,important:!0},s.nodes[0]])}return t}function k_(r,e,t){if(e.length===0)return e;let i={modifier:null,value:Ei};{let[n,...a]=ve(r,"/");if(a.length>1&&(n=n+"/"+a.slice(0,-1).join("/"),a=a.slice(-1)),a.length&&!t.variantMap.has(r)&&(r=n,i.modifier=a[0],!we(t.tailwindConfig,"generalizedModifiers")))return[]}if(r.endsWith("]")&&!r.startsWith("[")){let n=/(.)(-?)\[(.*)\]/g.exec(r);if(n){let[,a,s,o]=n;if(a==="@"&&s==="-")return[];if(a!=="@"&&s==="")return[];r=r.replace(`${s}[${o}]`,""),i.value=o}}if(Qo(r)&&!t.variantMap.has(r)){let n=t.offsets.recordVariant(r),a=K(r.slice(1,-1)),s=ve(a,",");if(s.length>1)return[];if(!s.every(os))return[];let o=s.map((l,c)=>[t.offsets.applyParallelOffset(n,c),Oi(l.trim())]);t.variantMap.set(r,o)}if(t.variantMap.has(r)){let n=Qo(r),a=t.variantOptions.get(r)?.[Pt]??{},s=t.variantMap.get(r).slice(),o=[],l=(()=>!(n||a.respectPrefix===!1))();for(let[c,f]of e){if(c.layer==="user")continue;let d=ee.root({nodes:[f.clone()]});for(let[p,h,b]of s){let w=function(){v.raws.neededBackup||(v.raws.neededBackup=!0,v.walkRules(O=>O.raws.originalSelector=O.selector))},k=function(O){return w(),v.each(B=>{B.type==="rule"&&(B.selectors=B.selectors.map(N=>O({get className(){return Ho(N)},selector:N})))}),v},v=(b??d).clone(),y=[],S=h({get container(){return w(),v},separator:t.tailwindConfig.separator,modifySelectors:k,wrap(O){let B=v.nodes;v.removeAll(),O.append(B),v.append(O)},format(O){y.push({format:O,respectPrefix:l})},args:i});if(Array.isArray(S)){for(let[O,B]of S.entries())s.push([t.offsets.applyParallelOffset(p,O),B,v.clone()]);continue}if(typeof S=="string"&&y.push({format:S,respectPrefix:l}),S===null)continue;v.raws.neededBackup&&(delete v.raws.neededBackup,v.walkRules(O=>{let B=O.raws.originalSelector;if(!B||(delete O.raws.originalSelector,B===O.selector))return;let N=O.selector,T=(0,is.default)(F=>{F.walkClasses(Y=>{Y.value=`${r}${t.tailwindConfig.separator}${Y.value}`})}).processSync(B);y.push({format:N.replace(T,"&"),respectPrefix:l}),O.selector=B})),v.nodes[0].raws.tailwind={...v.nodes[0].raws.tailwind,parentLayer:c.layer};let E=[{...c,sort:t.offsets.applyVariantOffset(c.sort,p,Object.assign(i,t.variantOptions.get(r))),collectedFormats:(c.collectedFormats??[]).concat(y)},v.nodes[0]];o.push(E)}}return o}return[]}function Wo(r,e,t={}){return!ke(r)&&!Array.isArray(r)?[[r],t]:Array.isArray(r)?Wo(r[0],e,r[1]):(e.has(r)||e.set(r,lr(r)),[e.get(r),t])}function A_(r){return S_.test(r)}function C_(r){if(!r.includes("://"))return!1;try{let e=new URL(r);return e.scheme!==""&&e.host!==""}catch(e){return!1}}function Ch(r){let e=!0;return r.walkDecls(t=>{if(!_h(t.prop,t.value))return e=!1,!1}),e}function _h(r,e){if(C_(`${r}:${e}`))return!1;try{return ee.parse(`a{${r}:${e}}`).toResult(),!0}catch(t){return!1}}function __(r,e){let[,t,i]=r.match(/^\[([a-zA-Z0-9-_]+):(\S+)\]$/)??[];if(i===void 0||!A_(t)||!cr(i))return null;let n=K(i,{property:t});return _h(t,n)?[[{sort:e.offsets.arbitraryProperty(r),layer:"utilities",options:{respectImportant:!0}},()=>({[Do(r)]:{[t]:n}})]]:null}function*E_(r,e){e.candidateRuleMap.has(r)&&(yield[e.candidateRuleMap.get(r),"DEFAULT"]),yield*function*(o){o!==null&&(yield[o,"DEFAULT"])}(__(r,e));let t=r,i=!1,n=e.tailwindConfig.prefix,a=n.length,s=t.startsWith(n)||t.startsWith(`-${n}`);t[a]==="-"&&s&&(i=!0,t=n+t.slice(a+1)),i&&e.candidateRuleMap.has(t)&&(yield[e.candidateRuleMap.get(t),"-DEFAULT"]);for(let[o,l]of w_(t))e.candidateRuleMap.has(o)&&(yield[e.candidateRuleMap.get(o),i?`-${l}`:l])}function O_(r,e){return r===gt?[gt]:ve(r,e)}function*T_(r,e){for(let t of r)t[1].raws.tailwind={...t[1].raws.tailwind,classCandidate:e,preserveSource:t[0].options?.preserveSource??!1},yield t}function*Go(r,e){let t=e.tailwindConfig.separator,[i,...n]=O_(r,t).reverse(),a=!1;i.startsWith("!")&&(a=!0,i=i.slice(1));for(let s of E_(i,e)){let o=[],l=new Map,[c,f]=s,d=c.length===1;for(let[p,h]of c){let b=[];if(typeof h=="function")for(let v of[].concat(h(f,{isOnlyPlugin:d}))){let[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}else if(f==="DEFAULT"||f==="-DEFAULT"){let v=h,[y,w]=Wo(v,e.postCssNodeCache);for(let k of y)b.push([{...p,options:{...p.options,...w}},k])}if(b.length>0){let v=Array.from(Zs(p.options?.types??[],f,p.options??{},e.tailwindConfig)).map(([y,w])=>w);v.length>0&&l.set(b,v),o.push(b)}}if(Qo(f)){if(o.length>1){let b=function(y){return y.length===1?y[0]:y.find(w=>{let k=l.get(w);return w.some(([{options:S},E])=>Ch(E)?S.types.some(({type:O,preferOnConflict:B})=>k.includes(O)&&B):!1)})},[p,h]=o.reduce((y,w)=>(w.some(([{options:S}])=>S.types.some(({type:E})=>E==="any"))?y[0].push(w):y[1].push(w),y),[[],[]]),v=b(h)??b(p);if(v)o=[v];else{let y=o.map(k=>new Set([...l.get(k)??[]]));for(let k of y)for(let S of k){let E=!1;for(let O of y)k!==O&&O.has(S)&&(O.delete(S),E=!0);E&&k.delete(S)}let w=[];for(let[k,S]of y.entries())for(let E of S){let O=o[k].map(([,B])=>B).flat().map(B=>B.toString().split(` +`).slice(1,-1).map(N=>N.trim()).map(N=>` ${N}`).join(` +`)).join(` + +`);w.push(` Use \`${r.replace("[",`[${E}:`)}\` for \`${O.trim()}\``);break}G.warn([`The class \`${r}\` is ambiguous and matches multiple utilities.`,...w,`If this is content and not a class, replace it with \`${r.replace("[","[").replace("]","]")}\` to silence this warning.`]);continue}}o=o.map(p=>p.filter(h=>Ch(h[1])))}o=o.flat(),o=Array.from(T_(o,i)),o=v_(o,e),a&&(o=x_(o,i));for(let p of n)o=k_(p,o,e);for(let p of o)p[1].raws.tailwind={...p[1].raws.tailwind,candidate:r},p=R_(p,{context:e,candidate:r}),p!==null&&(yield p)}}function R_(r,{context:e,candidate:t}){if(!r[0].collectedFormats)return r;let i=!0,n;try{n=dr(r[0].collectedFormats,{context:e,candidate:t})}catch{return null}let a=ee.root({nodes:[r[1].clone()]});return a.walkRules(s=>{if(!ns(s))try{let o=ts(s.selector,n,{candidate:t,context:e});if(o===null){s.remove();return}s.selector=o}catch{return i=!1,!1}}),!i||a.nodes.length===0?null:(r[1]=a.nodes[0],r)}function ns(r){return r.parent&&r.parent.type==="atrule"&&r.parent.name==="keyframes"}function P_(r){if(r===!0)return e=>{ns(e)||e.walkDecls(t=>{t.parent.type==="rule"&&!ns(t.parent)&&(t.important=!0)})};if(typeof r=="string")return e=>{ns(e)||(e.selectors=e.selectors.map(t=>rs(t,r)))}}function ss(r,e,t=!1){let i=[],n=P_(e.tailwindConfig.important);for(let a of r){if(e.notClassCache.has(a))continue;if(e.candidateRuleCache.has(a)){i=i.concat(Array.from(e.candidateRuleCache.get(a)));continue}let s=Array.from(Go(a,e));if(s.length===0){e.notClassCache.add(a);continue}e.classCache.set(a,s);let o=e.candidateRuleCache.get(a)??new Set;e.candidateRuleCache.set(a,o);for(let l of s){let[{sort:c,options:f},d]=l;if(f.respectImportant&&n){let h=ee.root({nodes:[d.clone()]});h.walkRules(n),d=h.nodes[0]}let p=[c,t?d.clone():d];o.add(p),e.ruleCache.add(p),i.push(p)}}return i}function Qo(r){return r.startsWith("[")&&r.endsWith("]")}var is,b_,S_,as=R(()=>{u();Ot();is=pe(it());Io();Kt();Wn();Fr();Be();It();zo();qo();Br();_i();Mo();zt();ct();Vo();b_=(0,is.default)(r=>r.first.filter(({type:e})=>e==="class").pop().value);S_=/^[a-z_-]/});var Eh,Oh=R(()=>{u();Eh={}});function I_(r){try{return Eh.createHash("md5").update(r,"utf-8").digest("binary")}catch(e){return""}}function Th(r,e){let t=e.toString();if(!t.includes("@tailwind"))return!1;let i=Lo.get(r),n=I_(t),a=i!==n;return Lo.set(r,n),a}var Rh=R(()=>{u();Oh();It()});function ls(r){return(r>0n)-(r<0n)}var Ph=R(()=>{u()});function Ih(r,e){let t=0n,i=0n;for(let[n,a]of e)r&n&&(t=t|n,i=i|a);return r&~t|i}var Dh=R(()=>{u()});function qh(r){let e=null;for(let t of r)e=e??t,e=e>t?e:t;return e}function D_(r,e){let t=r.length,i=e.length,n=t{u();Ph();Dh();Yo=class{constructor(){this.offsets={defaults:0n,base:0n,components:0n,utilities:0n,variants:0n,user:0n},this.layerPositions={defaults:0n,base:1n,components:2n,utilities:3n,user:4n,variants:5n},this.reservedVariantBits=0n,this.variantOffsets=new Map}create(e){return{layer:e,parentLayer:e,arbitrary:0n,variants:0n,parallelIndex:0n,index:this.offsets[e]++,propertyOffset:0n,property:"",options:[]}}arbitraryProperty(e){return{...this.create("utilities"),arbitrary:1n,property:e}}forVariant(e,t=0){let i=this.variantOffsets.get(e);if(i===void 0)throw new Error(`Cannot find offset for unknown variant ${e}`);return{...this.create("variants"),variants:i<n.startsWith("[")).sort(([n],[a])=>D_(n,a)),t=e.map(([,n])=>n).sort((n,a)=>ls(n-a));return e.map(([,n],a)=>[n,t[a]]).filter(([n,a])=>n!==a)}remapArbitraryVariantOffsets(e){let t=this.recalculateVariantOffsets();return t.length===0?e:e.map(i=>{let[n,a]=i;return n={...n,variants:Ih(n.variants,t)},[n,a]})}sortArbitraryProperties(e){let t=new Set;for(let[s]of e)s.arbitrary===1n&&t.add(s.property);if(t.size===0)return e;let i=Array.from(t).sort(),n=new Map,a=1n;for(let s of i)n.set(s,a++);return e.map(s=>{let[o,l]=s;return o={...o,propertyOffset:n.get(o.property)??0n},[o,l]})}sort(e){return e=this.remapArbitraryVariantOffsets(e),e=this.sortArbitraryProperties(e),e.sort(([t],[i])=>ls(this.compare(t,i)))}}});function Zo(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function Mh({type:r="any",...e}){let t=[].concat(r);return{...e,types:t.map(i=>Array.isArray(i)?{type:i[0],...i[1]}:{type:i,preferOnConflict:!1})}}function q_(r){let e=[],t="",i=0;for(let n=0;n0&&e.push(t.trim()),e=e.filter(n=>n!==""),e}function $_(r,e,{before:t=[]}={}){if(t=[].concat(t),t.length<=0){r.push(e);return}let i=r.length-1;for(let n of t){let a=r.indexOf(n);a!==-1&&(i=Math.min(i,a))}r.splice(i,0,e)}function Nh(r){return Array.isArray(r)?r.flatMap(e=>!Array.isArray(e)&&!ke(e)?e:lr(e)):Nh([r])}function L_(r,e){return(0,Ko.default)(i=>{let n=[];return e&&e(i),i.walkClasses(a=>{n.push(a.value)}),n}).transformSync(r)}function M_(r){r.walkPseudos(e=>{e.value===":not"&&e.remove()})}function N_(r,e={containsNonOnDemandable:!1},t=0){let i=[],n=[];r.type==="rule"?n.push(...r.selectors):r.type==="atrule"&&r.walkRules(a=>n.push(...a.selectors));for(let a of n){let s=L_(a,M_);s.length===0&&(e.containsNonOnDemandable=!0);for(let o of s)i.push(o)}return t===0?[e.containsNonOnDemandable||i.length===0,i]:i}function us(r){return Nh(r).flatMap(e=>{let t=new Map,[i,n]=N_(e);return i&&n.unshift(gt),n.map(a=>(t.has(e)||t.set(e,e),[a,t.get(e)]))})}function os(r){return r.startsWith("@")||r.includes("&")}function Oi(r){r=r.replace(/\n+/g,"").replace(/\s{1,}/g," ").trim();let e=q_(r).map(t=>{if(!t.startsWith("@"))return({format:a})=>a(t);let[,i,n]=/@(\S*)( .+|[({].*)?/g.exec(t);return({wrap:a})=>a(ee.atRule({name:i,params:n?.trim()??""}))}).reverse();return t=>{for(let i of e)i(t)}}function B_(r,e,{variantList:t,variantMap:i,offsets:n,classList:a}){function s(p,h){return p?(0,Lh.default)(r,p,h):r}function o(p){return ur(r.prefix,p)}function l(p,h){return p===gt?gt:h.respectPrefix?e.tailwindConfig.prefix+p:p}function c(p,h,b={}){let v=kt(p),y=s(["theme",...v],h);return mt(v[0])(y,b)}let f=0,d={postcss:ee,prefix:o,e:Te,config:s,theme:c,corePlugins:p=>Array.isArray(r.corePlugins)?r.corePlugins.includes(p):s(["corePlugins",p],!0),variants:()=>[],addBase(p){for(let[h,b]of us(p)){let v=l(h,{}),y=n.create("base");e.candidateRuleMap.has(v)||e.candidateRuleMap.set(v,[]),e.candidateRuleMap.get(v).push([{sort:y,layer:"base"},b])}},addDefaults(p,h){let b={[`@defaults ${p}`]:h};for(let[v,y]of us(b)){let w=l(v,{});e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("defaults"),layer:"defaults"},y])}},addComponents(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!1},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("components"),layer:"components",options:h},y])}},addUtilities(p,h){h=Object.assign({},{preserveSource:!1,respectPrefix:!0,respectImportant:!0},Array.isArray(h)?{}:h);for(let[v,y]of us(p)){let w=l(v,h);a.add(w),e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push([{sort:n.create("utilities"),layer:"utilities",options:h},y])}},matchUtilities:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!0,modifiers:!1},...h});let v=n.create("utilities");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"utilities",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},matchComponents:function(p,h){h=Mh({...{respectPrefix:!0,respectImportant:!1,modifiers:!1},...h});let v=n.create("components");for(let y in p){let S=function(O,{isOnlyPlugin:B}){let[N,T,F]=Js(h.types,O,h,r);if(N===void 0)return[];if(!h.types.some(({type:U})=>U===T))if(B)G.warn([`Unnecessary typehint \`${T}\` in \`${y}-${O}\`.`,`You can safely update it to \`${y}-${O.replace(T+":","")}\`.`]);else return[];if(!cr(N))return[];let Y={get modifier(){return h.modifiers||G.warn(`modifier-used-without-options-for-${y}`,["Your plugin must set `modifiers: true` in its options to support modifiers."]),F}},_=we(r,"generalizedModifiers");return[].concat(_?k(N,Y):k(N)).filter(Boolean).map(U=>({[Gn(y,O)]:U}))},w=l(y,h),k=p[y];a.add([w,h]);let E=[{sort:v,layer:"components",options:h},S];e.candidateRuleMap.has(w)||e.candidateRuleMap.set(w,[]),e.candidateRuleMap.get(w).push(E)}},addVariant(p,h,b={}){h=[].concat(h).map(v=>{if(typeof v!="string")return(y={})=>{let{args:w,modifySelectors:k,container:S,separator:E,wrap:O,format:B}=y,N=v(Object.assign({modifySelectors:k,container:S,separator:E},b.type===Xo.MatchVariant&&{args:w,wrap:O,format:B}));if(typeof N=="string"&&!os(N))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Array.isArray(N)?N.filter(T=>typeof T=="string").map(T=>Oi(T)):N&&typeof N=="string"&&Oi(N)(y)};if(!os(v))throw new Error(`Your custom variant \`${p}\` has an invalid format string. Make sure it's an at-rule or contains a \`&\` placeholder.`);return Oi(v)}),$_(t,p,b),i.set(p,h),e.variantOptions.set(p,b)},matchVariant(p,h,b){let v=b?.id??++f,y=p==="@",w=we(r,"generalizedModifiers");for(let[S,E]of Object.entries(b?.values??{}))S!=="DEFAULT"&&d.addVariant(y?`${p}${S}`:`${p}-${S}`,({args:O,container:B})=>h(E,w?{modifier:O?.modifier,container:B}:{container:B}),{...b,value:E,id:v,type:Xo.MatchVariant,variantInfo:Jo.Base});let k="DEFAULT"in(b?.values??{});d.addVariant(p,({args:S,container:E})=>S?.value===Ei&&!k?null:h(S?.value===Ei?b.values.DEFAULT:S?.value??(typeof S=="string"?S:""),w?{modifier:S?.modifier,container:E}:{container:E}),{...b,id:v,type:Xo.MatchVariant,variantInfo:Jo.Dynamic})}};return d}function fs(r){return el.has(r)||el.set(r,new Map),el.get(r)}function Bh(r,e){let t=!1,i=new Map;for(let n of r){if(!n)continue;let a=sa.parse(n),s=a.hash?a.href.replace(a.hash,""):a.href;s=a.search?s.replace(a.search,""):s;let o=be.statSync(decodeURIComponent(s),{throwIfNoEntry:!1})?.mtimeMs;!o||((!e.has(n)||o>e.get(n))&&(t=!0),i.set(n,o))}return[t,i]}function Fh(r){r.walkAtRules(e=>{["responsive","variants"].includes(e.name)&&(Fh(e),e.before(e.nodes),e.remove())})}function F_(r){let e=[];return r.each(t=>{t.type==="atrule"&&["responsive","variants"].includes(t.name)&&(t.name="layer",t.params="utilities")}),r.walkAtRules("layer",t=>{if(Fh(t),t.params==="base"){for(let i of t.nodes)e.push(function({addBase:n}){n(i,{respectPrefix:!1})});t.remove()}else if(t.params==="components"){for(let i of t.nodes)e.push(function({addComponents:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}else if(t.params==="utilities"){for(let i of t.nodes)e.push(function({addUtilities:n}){n(i,{respectPrefix:!1,preserveSource:!0})});t.remove()}}),e}function j_(r,e){let t=Object.entries({...se,...mh}).map(([l,c])=>r.tailwindConfig.corePlugins.includes(l)?c:null).filter(Boolean),i=r.tailwindConfig.plugins.map(l=>(l.__isOptionsFunction&&(l=l()),typeof l=="function"?l:l.handler)),n=F_(e),a=[se.childVariant,se.pseudoElementVariants,se.pseudoClassVariants,se.hasVariants,se.ariaVariants,se.dataVariants],s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.darkVariants,se.forcedColorsVariants,se.printVariant];return(r.tailwindConfig.darkMode==="class"||Array.isArray(r.tailwindConfig.darkMode)&&r.tailwindConfig.darkMode[0]==="class")&&(s=[se.supportsVariants,se.reducedMotionVariants,se.prefersContrastVariants,se.darkVariants,se.screenVariants,se.orientationVariants,se.directionVariants,se.forcedColorsVariants,se.printVariant]),[...t,...a,...i,...s,...n]}function z_(r,e){let t=[],i=new Map;e.variantMap=i;let n=new Yo;e.offsets=n;let a=new Set,s=B_(e.tailwindConfig,e,{variantList:t,variantMap:i,offsets:n,classList:a});for(let f of r)if(Array.isArray(f))for(let d of f)d(s);else f?.(s);n.recordVariants(t,f=>i.get(f).length);for(let[f,d]of i.entries())e.variantMap.set(f,d.map((p,h)=>[n.forVariant(f,h),p]));let o=(e.tailwindConfig.safelist??[]).filter(Boolean);if(o.length>0){let f=[];for(let d of o){if(typeof d=="string"){e.changedContent.push({content:d,extension:"html"});continue}if(d instanceof RegExp){G.warn("root-regex",["Regular expressions in `safelist` work differently in Tailwind CSS v3.0.","Update your `safelist` configuration to eliminate this warning.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"]);continue}f.push(d)}if(f.length>0){let d=new Map,p=e.tailwindConfig.prefix.length,h=f.some(b=>b.pattern.source.includes("!"));for(let b of a){let v=Array.isArray(b)?(()=>{let[y,w]=b,S=Object.keys(w?.values??{}).map(E=>Ci(y,E));return w?.supportsNegativeValues&&(S=[...S,...S.map(E=>"-"+E)],S=[...S,...S.map(E=>E.slice(0,p)+"-"+E.slice(p))]),w.types.some(({type:E})=>E==="color")&&(S=[...S,...S.flatMap(E=>Object.keys(e.tailwindConfig.theme.opacity).map(O=>`${E}/${O}`))]),h&&w?.respectImportant&&(S=[...S,...S.map(E=>"!"+E)]),S})():[b];for(let y of v)for(let{pattern:w,variants:k=[]}of f)if(w.lastIndex=0,d.has(w)||d.set(w,0),!!w.test(y)){d.set(w,d.get(w)+1),e.changedContent.push({content:y,extension:"html"});for(let S of k)e.changedContent.push({content:S+e.tailwindConfig.separator+y,extension:"html"})}}for(let[b,v]of d.entries())v===0&&G.warn([`The safelist pattern \`${b}\` doesn't match any Tailwind CSS classes.`,"Fix this pattern or remove it from your `safelist` configuration.","https://tailwindcss.com/docs/content-configuration#safelisting-classes"])}}let l=[].concat(e.tailwindConfig.darkMode??"media")[1]??"dark",c=[Zo(e,l),Zo(e,"group"),Zo(e,"peer")];e.getClassOrder=function(d){let p=[...d].sort((y,w)=>y===w?0:y[y,null])),b=ss(new Set(p),e,!0);b=e.offsets.sort(b);let v=BigInt(c.length);for(let[,y]of b){let w=y.raws.tailwind.candidate;h.set(w,h.get(w)??v++)}return d.map(y=>{let w=h.get(y)??null,k=c.indexOf(y);return w===null&&k!==-1&&(w=BigInt(k)),[y,w]})},e.getClassList=function(d={}){let p=[];for(let h of a)if(Array.isArray(h)){let[b,v]=h,y=[],w=Object.keys(v?.modifiers??{});v?.types?.some(({type:E})=>E==="color")&&w.push(...Object.keys(e.tailwindConfig.theme.opacity??{}));let k={modifiers:w},S=d.includeMetadata&&w.length>0;for(let[E,O]of Object.entries(v?.values??{})){if(O==null)continue;let B=Ci(b,E);if(p.push(S?[B,k]:B),v?.supportsNegativeValues&&xt(O)){let N=Ci(b,`-${E}`);y.push(S?[N,k]:N)}}p.push(...y)}else p.push(h);return p},e.getVariants=function(){let d=Math.random().toString(36).substring(7).toUpperCase(),p=[];for(let[h,b]of e.variantOptions.entries())b.variantInfo!==Jo.Base&&p.push({name:h,isArbitrary:b.type===Symbol.for("MATCH_VARIANT"),values:Object.keys(b.values??{}),hasDash:h!=="@",selectors({modifier:v,value:y}={}){let w=`TAILWINDPLACEHOLDER${d}`,k=ee.rule({selector:`.${w}`}),S=ee.root({nodes:[k.clone()]}),E=S.toString(),O=(e.variantMap.get(h)??[]).flatMap(([oe,A])=>A),B=[];for(let oe of O){let A=[],C={args:{modifier:v,value:b.values?.[y]??y},separator:e.tailwindConfig.separator,modifySelectors(V){return S.each(Ee=>{Ee.type==="rule"&&(Ee.selectors=Ee.selectors.map(Ie=>V({get className(){return Ho(Ie)},selector:Ie})))}),S},format(V){A.push(V)},wrap(V){A.push(`@${V.name} ${V.params} { & }`)},container:S},he=oe(C);if(A.length>0&&B.push(A),Array.isArray(he))for(let V of he)A=[],V(C),B.push(A)}let N=[],T=S.toString();E!==T&&(S.walkRules(oe=>{let A=oe.selector,C=(0,Ko.default)(he=>{he.walkClasses(V=>{V.value=`${h}${e.tailwindConfig.separator}${V.value}`})}).processSync(A);N.push(A.replace(C,"&").replace(w,"&"))}),S.walkAtRules(oe=>{N.push(`@${oe.name} (${oe.params}) { & }`)}));let F=!(y in(b.values??{})),Y=b[Pt]??{},_=(()=>!(F||Y.respectPrefix===!1))();B=B.map(oe=>oe.map(A=>({format:A,respectPrefix:_}))),N=N.map(oe=>({format:oe,respectPrefix:_}));let Q={candidate:w,context:e},U=B.map(oe=>ts(`.${w}`,dr(oe,Q),Q).replace(`.${w}`,"&").replace("{ & }","").trim());return N.length>0&&U.push(dr(N,Q).toString().replace(`.${w}`,"&")),U}});return p}}function jh(r,e){!r.classCache.has(e)||(r.notClassCache.add(e),r.classCache.delete(e),r.applyClassCache.delete(e),r.candidateRuleMap.delete(e),r.candidateRuleCache.delete(e),r.stylesheetCache=null)}function U_(r,e){let t=e.raws.tailwind.candidate;if(!!t){for(let i of r.ruleCache)i[1].raws.tailwind.candidate===t&&r.ruleCache.delete(i);jh(r,t)}}function tl(r,e=[],t=ee.root()){let i={disposables:[],ruleCache:new Set,candidateRuleCache:new Map,classCache:new Map,applyClassCache:new Map,notClassCache:new Set(r.blocklist??[]),postCssNodeCache:new Map,candidateRuleMap:new Map,tailwindConfig:r,changedContent:e,variantMap:new Map,stylesheetCache:null,variantOptions:new Map,markInvalidUtilityCandidate:a=>jh(i,a),markInvalidUtilityNode:a=>U_(i,a)},n=j_(i,t);return z_(n,i),i}function zh(r,e,t,i,n,a){let s=e.opts.from,o=i!==null;Je.DEBUG&&console.log("Source path:",s);let l;if(o&&hr.has(s))l=hr.get(s);else if(Ti.has(n)){let p=Ti.get(n);Dt.get(p).add(s),hr.set(s,p),l=p}let c=Th(s,r);if(l){let[p,h]=Bh([...a],fs(l));if(!p&&!c)return[l,!1,h]}if(hr.has(s)){let p=hr.get(s);if(Dt.has(p)&&(Dt.get(p).delete(s),Dt.get(p).size===0)){Dt.delete(p);for(let[h,b]of Ti)b===p&&Ti.delete(h);for(let h of p.disposables.splice(0))h(p)}}Je.DEBUG&&console.log("Setting up new context...");let f=tl(t,[],r);Object.assign(f,{userConfigPath:i});let[,d]=Bh([...a],fs(f));return Ti.set(n,f),hr.set(s,f),Dt.has(f)||Dt.set(f,new Set),Dt.get(f).add(s),[f,!0,d]}var Lh,Ko,Pt,Xo,Jo,el,hr,Ti,Dt,_i=R(()=>{u();ft();aa();Ot();Lh=pe(Oa()),Ko=pe(it());Si();Io();Wn();Kt();fr();qo();Fr();gh();It();It();Gi();Be();Hi();Mo();as();Rh();$h();ct();zo();Pt=Symbol(),Xo={AddVariant:Symbol.for("ADD_VARIANT"),MatchVariant:Symbol.for("MATCH_VARIANT")},Jo={Base:1<<0,Dynamic:1<<1};el=new WeakMap;hr=yh,Ti=bh,Dt=Zn});function rl(r){return r.ignore?[]:r.glob?m.env.ROLLUP_WATCH==="true"?[{type:"dependency",file:r.base}]:[{type:"dir-dependency",dir:r.base,glob:r.glob}]:[{type:"dependency",file:r.base}]}var Uh=R(()=>{u()});function Vh(r,e){return{handler:r,config:e}}var Hh,Wh=R(()=>{u();Vh.withOptions=function(r,e=()=>({})){let t=function(i){return{__options:i,handler:r(i),config:e(i)}};return t.__isOptionsFunction=!0,t.__pluginFunction=r,t.__configFunction=e,t};Hh=Vh});var il={};Ge(il,{default:()=>V_});var V_,nl=R(()=>{u();Wh();V_=Hh});var Qh=x((F4,Gh)=>{u();var H_=(nl(),il).default,W_={overflow:"hidden",display:"-webkit-box","-webkit-box-orient":"vertical"},G_=H_(function({matchUtilities:r,addUtilities:e,theme:t,variants:i}){let n=t("lineClamp");r({"line-clamp":a=>({...W_,"-webkit-line-clamp":`${a}`})},{values:n}),e([{".line-clamp-none":{"-webkit-line-clamp":"unset"}}],i("lineClamp"))},{theme:{lineClamp:{1:"1",2:"2",3:"3",4:"4",5:"5",6:"6"}},variants:{lineClamp:["responsive"]}});Gh.exports=G_});function sl(r){r.content.files.length===0&&G.warn("content-problems",["The `content` option in your Tailwind CSS configuration is missing or empty.","Configure your content sources or your generated CSS will be missing styles.","https://tailwindcss.com/docs/content-configuration"]);try{let e=Qh();r.plugins.includes(e)&&(G.warn("line-clamp-in-core",["As of Tailwind CSS v3.3, the `@tailwindcss/line-clamp` plugin is now included by default.","Remove it from the `plugins` array in your configuration to eliminate this warning."]),r.plugins=r.plugins.filter(t=>t!==e))}catch{}return r}var Yh=R(()=>{u();Be()});var Kh,Xh=R(()=>{u();Kh=()=>!1});var cs,Jh=R(()=>{u();cs={sync:r=>[].concat(r),generateTasks:r=>[{dynamic:!1,base:".",negative:[],positive:[].concat(r),patterns:[].concat(r)}],escapePath:r=>r}});var al,Zh=R(()=>{u();al=r=>r});var em,tm=R(()=>{u();em=()=>""});function rm(r){let e=r,t=em(r);return t!=="."&&(e=r.substr(t.length),e.charAt(0)==="/"&&(e=e.substr(1))),e.substr(0,2)==="./"?e=e.substr(2):e.charAt(0)==="/"&&(e=e.substr(1)),{base:t,glob:e}}var im=R(()=>{u();tm()});var ps=x(Ve=>{u();"use strict";Ve.isInteger=r=>typeof r=="number"?Number.isInteger(r):typeof r=="string"&&r.trim()!==""?Number.isInteger(Number(r)):!1;Ve.find=(r,e)=>r.nodes.find(t=>t.type===e);Ve.exceedsLimit=(r,e,t=1,i)=>i===!1||!Ve.isInteger(r)||!Ve.isInteger(e)?!1:(Number(e)-Number(r))/Number(t)>=i;Ve.escapeNode=(r,e=0,t)=>{let i=r.nodes[e];!i||(t&&i.type===t||i.type==="open"||i.type==="close")&&i.escaped!==!0&&(i.value="\\"+i.value,i.escaped=!0)};Ve.encloseBrace=r=>r.type!=="brace"?!1:r.commas>>0+r.ranges>>0==0?(r.invalid=!0,!0):!1;Ve.isInvalidBrace=r=>r.type!=="brace"?!1:r.invalid===!0||r.dollar?!0:r.commas>>0+r.ranges>>0==0||r.open!==!0||r.close!==!0?(r.invalid=!0,!0):!1;Ve.isOpenOrClose=r=>r.type==="open"||r.type==="close"?!0:r.open===!0||r.close===!0;Ve.reduce=r=>r.reduce((e,t)=>(t.type==="text"&&e.push(t.value),t.type==="range"&&(t.type="text"),e),[]);Ve.flatten=(...r)=>{let e=[],t=i=>{for(let n=0;n{u();"use strict";var nm=ps();sm.exports=(r,e={})=>{let t=(i,n={})=>{let a=e.escapeInvalid&&nm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o="";if(i.value)return(a||s)&&nm.isOpenOrClose(i)?"\\"+i.value:i.value;if(i.value)return i.value;if(i.nodes)for(let l of i.nodes)o+=t(l);return o};return t(r)}});var om=x((X4,am)=>{u();"use strict";am.exports=function(r){return typeof r=="number"?r-r==0:typeof r=="string"&&r.trim()!==""?Number.isFinite?Number.isFinite(+r):isFinite(+r):!1}});var gm=x((J4,mm)=>{u();"use strict";var lm=om(),Wt=(r,e,t)=>{if(lm(r)===!1)throw new TypeError("toRegexRange: expected the first argument to be a number");if(e===void 0||r===e)return String(r);if(lm(e)===!1)throw new TypeError("toRegexRange: expected the second argument to be a number.");let i={relaxZeros:!0,...t};typeof i.strictZeros=="boolean"&&(i.relaxZeros=i.strictZeros===!1);let n=String(i.relaxZeros),a=String(i.shorthand),s=String(i.capture),o=String(i.wrap),l=r+":"+e+"="+n+a+s+o;if(Wt.cache.hasOwnProperty(l))return Wt.cache[l].result;let c=Math.min(r,e),f=Math.max(r,e);if(Math.abs(c-f)===1){let v=r+"|"+e;return i.capture?`(${v})`:i.wrap===!1?v:`(?:${v})`}let d=hm(r)||hm(e),p={min:r,max:e,a:c,b:f},h=[],b=[];if(d&&(p.isPadded=d,p.maxLen=String(p.max).length),c<0){let v=f<0?Math.abs(f):1;b=um(v,Math.abs(c),p,i),c=p.a=0}return f>=0&&(h=um(c,f,p,i)),p.negatives=b,p.positives=h,p.result=Q_(b,h,i),i.capture===!0?p.result=`(${p.result})`:i.wrap!==!1&&h.length+b.length>1&&(p.result=`(?:${p.result})`),Wt.cache[l]=p,p.result};function Q_(r,e,t){let i=ol(r,e,"-",!1,t)||[],n=ol(e,r,"",!1,t)||[],a=ol(r,e,"-?",!0,t)||[];return i.concat(a).concat(n).join("|")}function Y_(r,e){let t=1,i=1,n=cm(r,t),a=new Set([e]);for(;r<=n&&n<=e;)a.add(n),t+=1,n=cm(r,t);for(n=pm(e+1,i)-1;r1&&o.count.pop(),o.count.push(f.count[0]),o.string=o.pattern+dm(o.count),s=c+1;continue}t.isPadded&&(d=eE(c,t,i)),f.string=d+f.pattern+dm(f.count),a.push(f),s=c+1,o=f}return a}function ol(r,e,t,i,n){let a=[];for(let s of r){let{string:o}=s;!i&&!fm(e,"string",o)&&a.push(t+o),i&&fm(e,"string",o)&&a.push(t+o)}return a}function X_(r,e){let t=[];for(let i=0;ie?1:e>r?-1:0}function fm(r,e,t){return r.some(i=>i[e]===t)}function cm(r,e){return Number(String(r).slice(0,-e)+"9".repeat(e))}function pm(r,e){return r-r%Math.pow(10,e)}function dm(r){let[e=0,t=""]=r;return t||e>1?`{${e+(t?","+t:"")}}`:""}function Z_(r,e,t){return`[${r}${e-r==1?"":"-"}${e}]`}function hm(r){return/^-?(0+)\d/.test(r)}function eE(r,e,t){if(!e.isPadded)return r;let i=Math.abs(e.maxLen-String(r).length),n=t.relaxZeros!==!1;switch(i){case 0:return"";case 1:return n?"0?":"0";case 2:return n?"0{0,2}":"00";default:return n?`0{0,${i}}`:`0{${i}}`}}Wt.cache={};Wt.clearCache=()=>Wt.cache={};mm.exports=Wt});var fl=x((Z4,Am)=>{u();"use strict";var tE=(Bn(),Nn),ym=gm(),bm=r=>r!==null&&typeof r=="object"&&!Array.isArray(r),rE=r=>e=>r===!0?Number(e):String(e),ll=r=>typeof r=="number"||typeof r=="string"&&r!=="",Ri=r=>Number.isInteger(+r),ul=r=>{let e=`${r}`,t=-1;if(e[0]==="-"&&(e=e.slice(1)),e==="0")return!1;for(;e[++t]==="0";);return t>0},iE=(r,e,t)=>typeof r=="string"||typeof e=="string"?!0:t.stringify===!0,nE=(r,e,t)=>{if(e>0){let i=r[0]==="-"?"-":"";i&&(r=r.slice(1)),r=i+r.padStart(i?e-1:e,"0")}return t===!1?String(r):r},wm=(r,e)=>{let t=r[0]==="-"?"-":"";for(t&&(r=r.slice(1),e--);r.length{r.negatives.sort((s,o)=>so?1:0),r.positives.sort((s,o)=>so?1:0);let t=e.capture?"":"?:",i="",n="",a;return r.positives.length&&(i=r.positives.join("|")),r.negatives.length&&(n=`-(${t}${r.negatives.join("|")})`),i&&n?a=`${i}|${n}`:a=i||n,e.wrap?`(${t}${a})`:a},vm=(r,e,t,i)=>{if(t)return ym(r,e,{wrap:!1,...i});let n=String.fromCharCode(r);if(r===e)return n;let a=String.fromCharCode(e);return`[${n}-${a}]`},xm=(r,e,t)=>{if(Array.isArray(r)){let i=t.wrap===!0,n=t.capture?"":"?:";return i?`(${n}${r.join("|")})`:r.join("|")}return ym(r,e,t)},km=(...r)=>new RangeError("Invalid range arguments: "+tE.inspect(...r)),Sm=(r,e,t)=>{if(t.strictRanges===!0)throw km([r,e]);return[]},aE=(r,e)=>{if(e.strictRanges===!0)throw new TypeError(`Expected step "${r}" to be a number`);return[]},oE=(r,e,t=1,i={})=>{let n=Number(r),a=Number(e);if(!Number.isInteger(n)||!Number.isInteger(a)){if(i.strictRanges===!0)throw km([r,e]);return[]}n===0&&(n=0),a===0&&(a=0);let s=n>a,o=String(r),l=String(e),c=String(t);t=Math.max(Math.abs(t),1);let f=ul(o)||ul(l)||ul(c),d=f?Math.max(o.length,l.length,c.length):0,p=f===!1&&iE(r,e,i)===!1,h=i.transform||rE(p);if(i.toRegex&&t===1)return vm(wm(r,d),wm(e,d),!0,i);let b={negatives:[],positives:[]},v=k=>b[k<0?"negatives":"positives"].push(Math.abs(k)),y=[],w=0;for(;s?n>=a:n<=a;)i.toRegex===!0&&t>1?v(n):y.push(nE(h(n,w),d,p)),n=s?n-t:n+t,w++;return i.toRegex===!0?t>1?sE(b,i):xm(y,null,{wrap:!1,...i}):y},lE=(r,e,t=1,i={})=>{if(!Ri(r)&&r.length>1||!Ri(e)&&e.length>1)return Sm(r,e,i);let n=i.transform||(p=>String.fromCharCode(p)),a=`${r}`.charCodeAt(0),s=`${e}`.charCodeAt(0),o=a>s,l=Math.min(a,s),c=Math.max(a,s);if(i.toRegex&&t===1)return vm(l,c,!1,i);let f=[],d=0;for(;o?a>=s:a<=s;)f.push(n(a,d)),a=o?a-t:a+t,d++;return i.toRegex===!0?xm(f,null,{wrap:!1,options:i}):f},hs=(r,e,t,i={})=>{if(e==null&&ll(r))return[r];if(!ll(r)||!ll(e))return Sm(r,e,i);if(typeof t=="function")return hs(r,e,1,{transform:t});if(bm(t))return hs(r,e,0,t);let n={...i};return n.capture===!0&&(n.wrap=!0),t=t||n.step||1,Ri(t)?Ri(r)&&Ri(e)?oE(r,e,t,n):lE(r,e,Math.max(Math.abs(t),1),n):t!=null&&!bm(t)?aE(t,n):hs(r,e,1,t)};Am.exports=hs});var Em=x((e6,_m)=>{u();"use strict";var uE=fl(),Cm=ps(),fE=(r,e={})=>{let t=(i,n={})=>{let a=Cm.isInvalidBrace(n),s=i.invalid===!0&&e.escapeInvalid===!0,o=a===!0||s===!0,l=e.escapeInvalid===!0?"\\":"",c="";if(i.isOpen===!0||i.isClose===!0)return l+i.value;if(i.type==="open")return o?l+i.value:"(";if(i.type==="close")return o?l+i.value:")";if(i.type==="comma")return i.prev.type==="comma"?"":o?i.value:"|";if(i.value)return i.value;if(i.nodes&&i.ranges>0){let f=Cm.reduce(i.nodes),d=uE(...f,{...e,wrap:!1,toRegex:!0});if(d.length!==0)return f.length>1&&d.length>1?`(${d})`:d}if(i.nodes)for(let f of i.nodes)c+=t(f,i);return c};return t(r)};_m.exports=fE});var Rm=x((t6,Tm)=>{u();"use strict";var cE=fl(),Om=ds(),mr=ps(),Gt=(r="",e="",t=!1)=>{let i=[];if(r=[].concat(r),e=[].concat(e),!e.length)return r;if(!r.length)return t?mr.flatten(e).map(n=>`{${n}}`):e;for(let n of r)if(Array.isArray(n))for(let a of n)i.push(Gt(a,e,t));else for(let a of e)t===!0&&typeof a=="string"&&(a=`{${a}}`),i.push(Array.isArray(a)?Gt(n,a,t):n+a);return mr.flatten(i)},pE=(r,e={})=>{let t=e.rangeLimit===void 0?1e3:e.rangeLimit,i=(n,a={})=>{n.queue=[];let s=a,o=a.queue;for(;s.type!=="brace"&&s.type!=="root"&&s.parent;)s=s.parent,o=s.queue;if(n.invalid||n.dollar){o.push(Gt(o.pop(),Om(n,e)));return}if(n.type==="brace"&&n.invalid!==!0&&n.nodes.length===2){o.push(Gt(o.pop(),["{}"]));return}if(n.nodes&&n.ranges>0){let d=mr.reduce(n.nodes);if(mr.exceedsLimit(...d,e.step,t))throw new RangeError("expanded array length exceeds range limit. Use options.rangeLimit to increase or disable the limit.");let p=cE(...d,e);p.length===0&&(p=Om(n,e)),o.push(Gt(o.pop(),p)),n.nodes=[];return}let l=mr.encloseBrace(n),c=n.queue,f=n;for(;f.type!=="brace"&&f.type!=="root"&&f.parent;)f=f.parent,c=f.queue;for(let d=0;d{u();"use strict";Pm.exports={MAX_LENGTH:1024*64,CHAR_0:"0",CHAR_9:"9",CHAR_UPPERCASE_A:"A",CHAR_LOWERCASE_A:"a",CHAR_UPPERCASE_Z:"Z",CHAR_LOWERCASE_Z:"z",CHAR_LEFT_PARENTHESES:"(",CHAR_RIGHT_PARENTHESES:")",CHAR_ASTERISK:"*",CHAR_AMPERSAND:"&",CHAR_AT:"@",CHAR_BACKSLASH:"\\",CHAR_BACKTICK:"`",CHAR_CARRIAGE_RETURN:"\r",CHAR_CIRCUMFLEX_ACCENT:"^",CHAR_COLON:":",CHAR_COMMA:",",CHAR_DOLLAR:"$",CHAR_DOT:".",CHAR_DOUBLE_QUOTE:'"',CHAR_EQUAL:"=",CHAR_EXCLAMATION_MARK:"!",CHAR_FORM_FEED:"\f",CHAR_FORWARD_SLASH:"/",CHAR_HASH:"#",CHAR_HYPHEN_MINUS:"-",CHAR_LEFT_ANGLE_BRACKET:"<",CHAR_LEFT_CURLY_BRACE:"{",CHAR_LEFT_SQUARE_BRACKET:"[",CHAR_LINE_FEED:` +`,CHAR_NO_BREAK_SPACE:"\xA0",CHAR_PERCENT:"%",CHAR_PLUS:"+",CHAR_QUESTION_MARK:"?",CHAR_RIGHT_ANGLE_BRACKET:">",CHAR_RIGHT_CURLY_BRACE:"}",CHAR_RIGHT_SQUARE_BRACKET:"]",CHAR_SEMICOLON:";",CHAR_SINGLE_QUOTE:"'",CHAR_SPACE:" ",CHAR_TAB:" ",CHAR_UNDERSCORE:"_",CHAR_VERTICAL_LINE:"|",CHAR_ZERO_WIDTH_NOBREAK_SPACE:"\uFEFF"}});var Mm=x((i6,Lm)=>{u();"use strict";var dE=ds(),{MAX_LENGTH:Dm,CHAR_BACKSLASH:cl,CHAR_BACKTICK:hE,CHAR_COMMA:mE,CHAR_DOT:gE,CHAR_LEFT_PARENTHESES:yE,CHAR_RIGHT_PARENTHESES:bE,CHAR_LEFT_CURLY_BRACE:wE,CHAR_RIGHT_CURLY_BRACE:vE,CHAR_LEFT_SQUARE_BRACKET:qm,CHAR_RIGHT_SQUARE_BRACKET:$m,CHAR_DOUBLE_QUOTE:xE,CHAR_SINGLE_QUOTE:kE,CHAR_NO_BREAK_SPACE:SE,CHAR_ZERO_WIDTH_NOBREAK_SPACE:AE}=Im(),CE=(r,e={})=>{if(typeof r!="string")throw new TypeError("Expected a string");let t=e||{},i=typeof t.maxLength=="number"?Math.min(Dm,t.maxLength):Dm;if(r.length>i)throw new SyntaxError(`Input length (${r.length}), exceeds max characters (${i})`);let n={type:"root",input:r,nodes:[]},a=[n],s=n,o=n,l=0,c=r.length,f=0,d=0,p,h={},b=()=>r[f++],v=y=>{if(y.type==="text"&&o.type==="dot"&&(o.type="text"),o&&o.type==="text"&&y.type==="text"){o.value+=y.value;return}return s.nodes.push(y),y.parent=s,y.prev=o,o=y,y};for(v({type:"bos"});f0){if(s.ranges>0){s.ranges=0;let y=s.nodes.shift();s.nodes=[y,{type:"text",value:dE(s)}]}v({type:"comma",value:p}),s.commas++;continue}if(p===gE&&d>0&&s.commas===0){let y=s.nodes;if(d===0||y.length===0){v({type:"text",value:p});continue}if(o.type==="dot"){if(s.range=[],o.value+=p,o.type="range",s.nodes.length!==3&&s.nodes.length!==5){s.invalid=!0,s.ranges=0,o.type="text";continue}s.ranges++,s.args=[];continue}if(o.type==="range"){y.pop();let w=y[y.length-1];w.value+=o.value+p,o=w,s.ranges--;continue}v({type:"dot",value:p});continue}v({type:"text",value:p})}do if(s=a.pop(),s.type!=="root"){s.nodes.forEach(k=>{k.nodes||(k.type==="open"&&(k.isOpen=!0),k.type==="close"&&(k.isClose=!0),k.nodes||(k.type="text"),k.invalid=!0)});let y=a[a.length-1],w=y.nodes.indexOf(s);y.nodes.splice(w,1,...s.nodes)}while(a.length>0);return v({type:"eos"}),n};Lm.exports=CE});var Fm=x((n6,Bm)=>{u();"use strict";var Nm=ds(),_E=Em(),EE=Rm(),OE=Mm(),Le=(r,e={})=>{let t=[];if(Array.isArray(r))for(let i of r){let n=Le.create(i,e);Array.isArray(n)?t.push(...n):t.push(n)}else t=[].concat(Le.create(r,e));return e&&e.expand===!0&&e.nodupes===!0&&(t=[...new Set(t)]),t};Le.parse=(r,e={})=>OE(r,e);Le.stringify=(r,e={})=>typeof r=="string"?Nm(Le.parse(r,e),e):Nm(r,e);Le.compile=(r,e={})=>(typeof r=="string"&&(r=Le.parse(r,e)),_E(r,e));Le.expand=(r,e={})=>{typeof r=="string"&&(r=Le.parse(r,e));let t=EE(r,e);return e.noempty===!0&&(t=t.filter(Boolean)),e.nodupes===!0&&(t=[...new Set(t)]),t};Le.create=(r,e={})=>r===""||r.length<3?[r]:e.expand!==!0?Le.compile(r,e):Le.expand(r,e);Bm.exports=Le});var Pi=x((s6,Hm)=>{u();"use strict";var TE=(et(),Ur),at="\\\\/",jm=`[^${at}]`,yt="\\.",RE="\\+",PE="\\?",ms="\\/",IE="(?=.)",zm="[^/]",pl=`(?:${ms}|$)`,Um=`(?:^|${ms})`,dl=`${yt}{1,2}${pl}`,DE=`(?!${yt})`,qE=`(?!${Um}${dl})`,$E=`(?!${yt}{0,1}${pl})`,LE=`(?!${dl})`,ME=`[^.${ms}]`,NE=`${zm}*?`,Vm={DOT_LITERAL:yt,PLUS_LITERAL:RE,QMARK_LITERAL:PE,SLASH_LITERAL:ms,ONE_CHAR:IE,QMARK:zm,END_ANCHOR:pl,DOTS_SLASH:dl,NO_DOT:DE,NO_DOTS:qE,NO_DOT_SLASH:$E,NO_DOTS_SLASH:LE,QMARK_NO_DOT:ME,STAR:NE,START_ANCHOR:Um},BE={...Vm,SLASH_LITERAL:`[${at}]`,QMARK:jm,STAR:`${jm}*?`,DOTS_SLASH:`${yt}{1,2}(?:[${at}]|$)`,NO_DOT:`(?!${yt})`,NO_DOTS:`(?!(?:^|[${at}])${yt}{1,2}(?:[${at}]|$))`,NO_DOT_SLASH:`(?!${yt}{0,1}(?:[${at}]|$))`,NO_DOTS_SLASH:`(?!${yt}{1,2}(?:[${at}]|$))`,QMARK_NO_DOT:`[^.${at}]`,START_ANCHOR:`(?:^|[${at}])`,END_ANCHOR:`(?:[${at}]|$)`},FE={alnum:"a-zA-Z0-9",alpha:"a-zA-Z",ascii:"\\x00-\\x7F",blank:" \\t",cntrl:"\\x00-\\x1F\\x7F",digit:"0-9",graph:"\\x21-\\x7E",lower:"a-z",print:"\\x20-\\x7E ",punct:"\\-!\"#$%&'()\\*+,./:;<=>?@[\\]^_`{|}~",space:" \\t\\r\\n\\v\\f",upper:"A-Z",word:"A-Za-z0-9_",xdigit:"A-Fa-f0-9"};Hm.exports={MAX_LENGTH:1024*64,POSIX_REGEX_SOURCE:FE,REGEX_BACKSLASH:/\\(?![*+?^${}(|)[\]])/g,REGEX_NON_SPECIAL_CHARS:/^[^@![\].,$*+?^{}()|\\/]+/,REGEX_SPECIAL_CHARS:/[-*+?.^${}(|)[\]]/,REGEX_SPECIAL_CHARS_BACKREF:/(\\?)((\W)(\3*))/g,REGEX_SPECIAL_CHARS_GLOBAL:/([-*+?.^${}(|)[\]])/g,REGEX_REMOVE_BACKSLASH:/(?:\[.*?[^\\]\]|\\(?=.))/g,REPLACEMENTS:{"***":"*","**/**":"**","**/**/**":"**"},CHAR_0:48,CHAR_9:57,CHAR_UPPERCASE_A:65,CHAR_LOWERCASE_A:97,CHAR_UPPERCASE_Z:90,CHAR_LOWERCASE_Z:122,CHAR_LEFT_PARENTHESES:40,CHAR_RIGHT_PARENTHESES:41,CHAR_ASTERISK:42,CHAR_AMPERSAND:38,CHAR_AT:64,CHAR_BACKWARD_SLASH:92,CHAR_CARRIAGE_RETURN:13,CHAR_CIRCUMFLEX_ACCENT:94,CHAR_COLON:58,CHAR_COMMA:44,CHAR_DOT:46,CHAR_DOUBLE_QUOTE:34,CHAR_EQUAL:61,CHAR_EXCLAMATION_MARK:33,CHAR_FORM_FEED:12,CHAR_FORWARD_SLASH:47,CHAR_GRAVE_ACCENT:96,CHAR_HASH:35,CHAR_HYPHEN_MINUS:45,CHAR_LEFT_ANGLE_BRACKET:60,CHAR_LEFT_CURLY_BRACE:123,CHAR_LEFT_SQUARE_BRACKET:91,CHAR_LINE_FEED:10,CHAR_NO_BREAK_SPACE:160,CHAR_PERCENT:37,CHAR_PLUS:43,CHAR_QUESTION_MARK:63,CHAR_RIGHT_ANGLE_BRACKET:62,CHAR_RIGHT_CURLY_BRACE:125,CHAR_RIGHT_SQUARE_BRACKET:93,CHAR_SEMICOLON:59,CHAR_SINGLE_QUOTE:39,CHAR_SPACE:32,CHAR_TAB:9,CHAR_UNDERSCORE:95,CHAR_VERTICAL_LINE:124,CHAR_ZERO_WIDTH_NOBREAK_SPACE:65279,SEP:TE.sep,extglobChars(r){return{"!":{type:"negate",open:"(?:(?!(?:",close:`))${r.STAR})`},"?":{type:"qmark",open:"(?:",close:")?"},"+":{type:"plus",open:"(?:",close:")+"},"*":{type:"star",open:"(?:",close:")*"},"@":{type:"at",open:"(?:",close:")"}}},globChars(r){return r===!0?BE:Vm}}});var Ii=x(Re=>{u();"use strict";var jE=(et(),Ur),zE=m.platform==="win32",{REGEX_BACKSLASH:UE,REGEX_REMOVE_BACKSLASH:VE,REGEX_SPECIAL_CHARS:HE,REGEX_SPECIAL_CHARS_GLOBAL:WE}=Pi();Re.isObject=r=>r!==null&&typeof r=="object"&&!Array.isArray(r);Re.hasRegexChars=r=>HE.test(r);Re.isRegexChar=r=>r.length===1&&Re.hasRegexChars(r);Re.escapeRegex=r=>r.replace(WE,"\\$1");Re.toPosixSlashes=r=>r.replace(UE,"/");Re.removeBackslashes=r=>r.replace(VE,e=>e==="\\"?"":e);Re.supportsLookbehinds=()=>{let r=m.version.slice(1).split(".").map(Number);return r.length===3&&r[0]>=9||r[0]===8&&r[1]>=10};Re.isWindows=r=>r&&typeof r.windows=="boolean"?r.windows:zE===!0||jE.sep==="\\";Re.escapeLast=(r,e,t)=>{let i=r.lastIndexOf(e,t);return i===-1?r:r[i-1]==="\\"?Re.escapeLast(r,e,i-1):`${r.slice(0,i)}\\${r.slice(i)}`};Re.removePrefix=(r,e={})=>{let t=r;return t.startsWith("./")&&(t=t.slice(2),e.prefix="./"),t};Re.wrapOutput=(r,e={},t={})=>{let i=t.contains?"":"^",n=t.contains?"":"$",a=`${i}(?:${r})${n}`;return e.negated===!0&&(a=`(?:^(?!${a}).*$)`),a}});var Zm=x((o6,Jm)=>{u();"use strict";var Wm=Ii(),{CHAR_ASTERISK:hl,CHAR_AT:GE,CHAR_BACKWARD_SLASH:Di,CHAR_COMMA:QE,CHAR_DOT:ml,CHAR_EXCLAMATION_MARK:gl,CHAR_FORWARD_SLASH:Gm,CHAR_LEFT_CURLY_BRACE:yl,CHAR_LEFT_PARENTHESES:bl,CHAR_LEFT_SQUARE_BRACKET:YE,CHAR_PLUS:KE,CHAR_QUESTION_MARK:Qm,CHAR_RIGHT_CURLY_BRACE:XE,CHAR_RIGHT_PARENTHESES:Ym,CHAR_RIGHT_SQUARE_BRACKET:JE}=Pi(),Km=r=>r===Gm||r===Di,Xm=r=>{r.isPrefix!==!0&&(r.depth=r.isGlobstar?1/0:1)},ZE=(r,e)=>{let t=e||{},i=r.length-1,n=t.parts===!0||t.scanToEnd===!0,a=[],s=[],o=[],l=r,c=-1,f=0,d=0,p=!1,h=!1,b=!1,v=!1,y=!1,w=!1,k=!1,S=!1,E=!1,O=!1,B=0,N,T,F={value:"",depth:0,isGlob:!1},Y=()=>c>=i,_=()=>l.charCodeAt(c+1),Q=()=>(N=T,l.charCodeAt(++c));for(;c0&&(oe=l.slice(0,f),l=l.slice(f),d-=f),U&&b===!0&&d>0?(U=l.slice(0,d),A=l.slice(d)):b===!0?(U="",A=l):U=l,U&&U!==""&&U!=="/"&&U!==l&&Km(U.charCodeAt(U.length-1))&&(U=U.slice(0,-1)),t.unescape===!0&&(A&&(A=Wm.removeBackslashes(A)),U&&k===!0&&(U=Wm.removeBackslashes(U)));let C={prefix:oe,input:r,start:f,base:U,glob:A,isBrace:p,isBracket:h,isGlob:b,isExtglob:v,isGlobstar:y,negated:S,negatedExtglob:E};if(t.tokens===!0&&(C.maxDepth=0,Km(T)||s.push(F),C.tokens=s),t.parts===!0||t.tokens===!0){let he;for(let V=0;V{u();"use strict";var gs=Pi(),Me=Ii(),{MAX_LENGTH:ys,POSIX_REGEX_SOURCE:e2,REGEX_NON_SPECIAL_CHARS:t2,REGEX_SPECIAL_CHARS_BACKREF:r2,REPLACEMENTS:eg}=gs,i2=(r,e)=>{if(typeof e.expandRange=="function")return e.expandRange(...r,e);r.sort();let t=`[${r.join("-")}]`;try{new RegExp(t)}catch(i){return r.map(n=>Me.escapeRegex(n)).join("..")}return t},gr=(r,e)=>`Missing ${r}: "${e}" - use "\\\\${e}" to match literal characters`,wl=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");r=eg[r]||r;let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);let a={type:"bos",value:"",output:t.prepend||""},s=[a],o=t.capture?"":"?:",l=Me.isWindows(e),c=gs.globChars(l),f=gs.extglobChars(c),{DOT_LITERAL:d,PLUS_LITERAL:p,SLASH_LITERAL:h,ONE_CHAR:b,DOTS_SLASH:v,NO_DOT:y,NO_DOT_SLASH:w,NO_DOTS_SLASH:k,QMARK:S,QMARK_NO_DOT:E,STAR:O,START_ANCHOR:B}=c,N=q=>`(${o}(?:(?!${B}${q.dot?v:d}).)*?)`,T=t.dot?"":y,F=t.dot?S:E,Y=t.bash===!0?N(t):O;t.capture&&(Y=`(${Y})`),typeof t.noext=="boolean"&&(t.noextglob=t.noext);let _={input:r,index:-1,start:0,dot:t.dot===!0,consumed:"",output:"",prefix:"",backtrack:!1,negated:!1,brackets:0,braces:0,parens:0,quotes:0,globstar:!1,tokens:s};r=Me.removePrefix(r,_),n=r.length;let Q=[],U=[],oe=[],A=a,C,he=()=>_.index===n-1,V=_.peek=(q=1)=>r[_.index+q],Ee=_.advance=()=>r[++_.index]||"",Ie=()=>r.slice(_.index+1),De=(q="",ae=0)=>{_.consumed+=q,_.index+=ae},Bi=q=>{_.output+=q.output!=null?q.output:q.value,De(q.value)},Rv=()=>{let q=1;for(;V()==="!"&&(V(2)!=="("||V(3)==="?");)Ee(),_.start++,q++;return q%2==0?!1:(_.negated=!0,_.start++,!0)},Fi=q=>{_[q]++,oe.push(q)},Ft=q=>{_[q]--,oe.pop()},W=q=>{if(A.type==="globstar"){let ae=_.braces>0&&(q.type==="comma"||q.type==="brace"),I=q.extglob===!0||Q.length&&(q.type==="pipe"||q.type==="paren");q.type!=="slash"&&q.type!=="paren"&&!ae&&!I&&(_.output=_.output.slice(0,-A.output.length),A.type="star",A.value="*",A.output=Y,_.output+=A.output)}if(Q.length&&q.type!=="paren"&&(Q[Q.length-1].inner+=q.value),(q.value||q.output)&&Bi(q),A&&A.type==="text"&&q.type==="text"){A.value+=q.value,A.output=(A.output||"")+q.value;return}q.prev=A,s.push(q),A=q},ji=(q,ae)=>{let I={...f[ae],conditions:1,inner:""};I.prev=A,I.parens=_.parens,I.output=_.output;let H=(t.capture?"(":"")+I.open;Fi("parens"),W({type:q,value:ae,output:_.output?"":b}),W({type:"paren",extglob:!0,value:Ee(),output:H}),Q.push(I)},Pv=q=>{let ae=q.close+(t.capture?")":""),I;if(q.type==="negate"){let H=Y;if(q.inner&&q.inner.length>1&&q.inner.includes("/")&&(H=N(t)),(H!==Y||he()||/^\)+$/.test(Ie()))&&(ae=q.close=`)$))${H}`),q.inner.includes("*")&&(I=Ie())&&/^\.[^\\/.]+$/.test(I)){let ce=wl(I,{...e,fastpaths:!1}).output;ae=q.close=`)${ce})${H})`}q.prev.type==="bos"&&(_.negatedExtglob=!0)}W({type:"paren",extglob:!0,value:C,output:ae}),Ft("parens")};if(t.fastpaths!==!1&&!/(^[*!]|[/()[\]{}"])/.test(r)){let q=!1,ae=r.replace(r2,(I,H,ce,Ce,ye,Ms)=>Ce==="\\"?(q=!0,I):Ce==="?"?H?H+Ce+(ye?S.repeat(ye.length):""):Ms===0?F+(ye?S.repeat(ye.length):""):S.repeat(ce.length):Ce==="."?d.repeat(ce.length):Ce==="*"?H?H+Ce+(ye?Y:""):Y:H?I:`\\${I}`);return q===!0&&(t.unescape===!0?ae=ae.replace(/\\/g,""):ae=ae.replace(/\\+/g,I=>I.length%2==0?"\\\\":I?"\\":"")),ae===r&&t.contains===!0?(_.output=r,_):(_.output=Me.wrapOutput(ae,_,e),_)}for(;!he();){if(C=Ee(),C==="\0")continue;if(C==="\\"){let I=V();if(I==="/"&&t.bash!==!0||I==="."||I===";")continue;if(!I){C+="\\",W({type:"text",value:C});continue}let H=/^\\+/.exec(Ie()),ce=0;if(H&&H[0].length>2&&(ce=H[0].length,_.index+=ce,ce%2!=0&&(C+="\\")),t.unescape===!0?C=Ee():C+=Ee(),_.brackets===0){W({type:"text",value:C});continue}}if(_.brackets>0&&(C!=="]"||A.value==="["||A.value==="[^")){if(t.posix!==!1&&C===":"){let I=A.value.slice(1);if(I.includes("[")&&(A.posix=!0,I.includes(":"))){let H=A.value.lastIndexOf("["),ce=A.value.slice(0,H),Ce=A.value.slice(H+2),ye=e2[Ce];if(ye){A.value=ce+ye,_.backtrack=!0,Ee(),!a.output&&s.indexOf(A)===1&&(a.output=b);continue}}}(C==="["&&V()!==":"||C==="-"&&V()==="]")&&(C=`\\${C}`),C==="]"&&(A.value==="["||A.value==="[^")&&(C=`\\${C}`),t.posix===!0&&C==="!"&&A.value==="["&&(C="^"),A.value+=C,Bi({value:C});continue}if(_.quotes===1&&C!=='"'){C=Me.escapeRegex(C),A.value+=C,Bi({value:C});continue}if(C==='"'){_.quotes=_.quotes===1?0:1,t.keepQuotes===!0&&W({type:"text",value:C});continue}if(C==="("){Fi("parens"),W({type:"paren",value:C});continue}if(C===")"){if(_.parens===0&&t.strictBrackets===!0)throw new SyntaxError(gr("opening","("));let I=Q[Q.length-1];if(I&&_.parens===I.parens+1){Pv(Q.pop());continue}W({type:"paren",value:C,output:_.parens?")":"\\)"}),Ft("parens");continue}if(C==="["){if(t.nobracket===!0||!Ie().includes("]")){if(t.nobracket!==!0&&t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));C=`\\${C}`}else Fi("brackets");W({type:"bracket",value:C});continue}if(C==="]"){if(t.nobracket===!0||A&&A.type==="bracket"&&A.value.length===1){W({type:"text",value:C,output:`\\${C}`});continue}if(_.brackets===0){if(t.strictBrackets===!0)throw new SyntaxError(gr("opening","["));W({type:"text",value:C,output:`\\${C}`});continue}Ft("brackets");let I=A.value.slice(1);if(A.posix!==!0&&I[0]==="^"&&!I.includes("/")&&(C=`/${C}`),A.value+=C,Bi({value:C}),t.literalBrackets===!1||Me.hasRegexChars(I))continue;let H=Me.escapeRegex(A.value);if(_.output=_.output.slice(0,-A.value.length),t.literalBrackets===!0){_.output+=H,A.value=H;continue}A.value=`(${o}${H}|${A.value})`,_.output+=A.value;continue}if(C==="{"&&t.nobrace!==!0){Fi("braces");let I={type:"brace",value:C,output:"(",outputIndex:_.output.length,tokensIndex:_.tokens.length};U.push(I),W(I);continue}if(C==="}"){let I=U[U.length-1];if(t.nobrace===!0||!I){W({type:"text",value:C,output:C});continue}let H=")";if(I.dots===!0){let ce=s.slice(),Ce=[];for(let ye=ce.length-1;ye>=0&&(s.pop(),ce[ye].type!=="brace");ye--)ce[ye].type!=="dots"&&Ce.unshift(ce[ye].value);H=i2(Ce,t),_.backtrack=!0}if(I.comma!==!0&&I.dots!==!0){let ce=_.output.slice(0,I.outputIndex),Ce=_.tokens.slice(I.tokensIndex);I.value=I.output="\\{",C=H="\\}",_.output=ce;for(let ye of Ce)_.output+=ye.output||ye.value}W({type:"brace",value:C,output:H}),Ft("braces"),U.pop();continue}if(C==="|"){Q.length>0&&Q[Q.length-1].conditions++,W({type:"text",value:C});continue}if(C===","){let I=C,H=U[U.length-1];H&&oe[oe.length-1]==="braces"&&(H.comma=!0,I="|"),W({type:"comma",value:C,output:I});continue}if(C==="/"){if(A.type==="dot"&&_.index===_.start+1){_.start=_.index+1,_.consumed="",_.output="",s.pop(),A=a;continue}W({type:"slash",value:C,output:h});continue}if(C==="."){if(_.braces>0&&A.type==="dot"){A.value==="."&&(A.output=d);let I=U[U.length-1];A.type="dots",A.output+=C,A.value+=C,I.dots=!0;continue}if(_.braces+_.parens===0&&A.type!=="bos"&&A.type!=="slash"){W({type:"text",value:C,output:d});continue}W({type:"dot",value:C,output:d});continue}if(C==="?"){if(!(A&&A.value==="(")&&t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("qmark",C);continue}if(A&&A.type==="paren"){let H=V(),ce=C;if(H==="<"&&!Me.supportsLookbehinds())throw new Error("Node.js v10 or higher is required for regex lookbehinds");(A.value==="("&&!/[!=<:]/.test(H)||H==="<"&&!/<([!=]|\w+>)/.test(Ie()))&&(ce=`\\${C}`),W({type:"text",value:C,output:ce});continue}if(t.dot!==!0&&(A.type==="slash"||A.type==="bos")){W({type:"qmark",value:C,output:E});continue}W({type:"qmark",value:C,output:S});continue}if(C==="!"){if(t.noextglob!==!0&&V()==="("&&(V(2)!=="?"||!/[!=<:]/.test(V(3)))){ji("negate",C);continue}if(t.nonegate!==!0&&_.index===0){Rv();continue}}if(C==="+"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){ji("plus",C);continue}if(A&&A.value==="("||t.regex===!1){W({type:"plus",value:C,output:p});continue}if(A&&(A.type==="bracket"||A.type==="paren"||A.type==="brace")||_.parens>0){W({type:"plus",value:C});continue}W({type:"plus",value:p});continue}if(C==="@"){if(t.noextglob!==!0&&V()==="("&&V(2)!=="?"){W({type:"at",extglob:!0,value:C,output:""});continue}W({type:"text",value:C});continue}if(C!=="*"){(C==="$"||C==="^")&&(C=`\\${C}`);let I=t2.exec(Ie());I&&(C+=I[0],_.index+=I[0].length),W({type:"text",value:C});continue}if(A&&(A.type==="globstar"||A.star===!0)){A.type="star",A.star=!0,A.value+=C,A.output=Y,_.backtrack=!0,_.globstar=!0,De(C);continue}let q=Ie();if(t.noextglob!==!0&&/^\([^?]/.test(q)){ji("star",C);continue}if(A.type==="star"){if(t.noglobstar===!0){De(C);continue}let I=A.prev,H=I.prev,ce=I.type==="slash"||I.type==="bos",Ce=H&&(H.type==="star"||H.type==="globstar");if(t.bash===!0&&(!ce||q[0]&&q[0]!=="/")){W({type:"star",value:C,output:""});continue}let ye=_.braces>0&&(I.type==="comma"||I.type==="brace"),Ms=Q.length&&(I.type==="pipe"||I.type==="paren");if(!ce&&I.type!=="paren"&&!ye&&!Ms){W({type:"star",value:C,output:""});continue}for(;q.slice(0,3)==="/**";){let zi=r[_.index+4];if(zi&&zi!=="/")break;q=q.slice(3),De("/**",3)}if(I.type==="bos"&&he()){A.type="globstar",A.value+=C,A.output=N(t),_.output=A.output,_.globstar=!0,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&!Ce&&he()){_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=N(t)+(t.strictSlashes?")":"|$)"),A.value+=C,_.globstar=!0,_.output+=I.output+A.output,De(C);continue}if(I.type==="slash"&&I.prev.type!=="bos"&&q[0]==="/"){let zi=q[1]!==void 0?"|$":"";_.output=_.output.slice(0,-(I.output+A.output).length),I.output=`(?:${I.output}`,A.type="globstar",A.output=`${N(t)}${h}|${h}${zi})`,A.value+=C,_.output+=I.output+A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}if(I.type==="bos"&&q[0]==="/"){A.type="globstar",A.value+=C,A.output=`(?:^|${h}|${N(t)}${h})`,_.output=A.output,_.globstar=!0,De(C+Ee()),W({type:"slash",value:"/",output:""});continue}_.output=_.output.slice(0,-A.output.length),A.type="globstar",A.output=N(t),A.value+=C,_.output+=A.output,_.globstar=!0,De(C);continue}let ae={type:"star",value:C,output:Y};if(t.bash===!0){ae.output=".*?",(A.type==="bos"||A.type==="slash")&&(ae.output=T+ae.output),W(ae);continue}if(A&&(A.type==="bracket"||A.type==="paren")&&t.regex===!0){ae.output=C,W(ae);continue}(_.index===_.start||A.type==="slash"||A.type==="dot")&&(A.type==="dot"?(_.output+=w,A.output+=w):t.dot===!0?(_.output+=k,A.output+=k):(_.output+=T,A.output+=T),V()!=="*"&&(_.output+=b,A.output+=b)),W(ae)}for(;_.brackets>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","]"));_.output=Me.escapeLast(_.output,"["),Ft("brackets")}for(;_.parens>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing",")"));_.output=Me.escapeLast(_.output,"("),Ft("parens")}for(;_.braces>0;){if(t.strictBrackets===!0)throw new SyntaxError(gr("closing","}"));_.output=Me.escapeLast(_.output,"{"),Ft("braces")}if(t.strictSlashes!==!0&&(A.type==="star"||A.type==="bracket")&&W({type:"maybe_slash",value:"",output:`${h}?`}),_.backtrack===!0){_.output="";for(let q of _.tokens)_.output+=q.output!=null?q.output:q.value,q.suffix&&(_.output+=q.suffix)}return _};wl.fastpaths=(r,e)=>{let t={...e},i=typeof t.maxLength=="number"?Math.min(ys,t.maxLength):ys,n=r.length;if(n>i)throw new SyntaxError(`Input length: ${n}, exceeds maximum allowed length: ${i}`);r=eg[r]||r;let a=Me.isWindows(e),{DOT_LITERAL:s,SLASH_LITERAL:o,ONE_CHAR:l,DOTS_SLASH:c,NO_DOT:f,NO_DOTS:d,NO_DOTS_SLASH:p,STAR:h,START_ANCHOR:b}=gs.globChars(a),v=t.dot?d:f,y=t.dot?p:f,w=t.capture?"":"?:",k={negated:!1,prefix:""},S=t.bash===!0?".*?":h;t.capture&&(S=`(${S})`);let E=T=>T.noglobstar===!0?S:`(${w}(?:(?!${b}${T.dot?c:s}).)*?)`,O=T=>{switch(T){case"*":return`${v}${l}${S}`;case".*":return`${s}${l}${S}`;case"*.*":return`${v}${S}${s}${l}${S}`;case"*/*":return`${v}${S}${o}${l}${y}${S}`;case"**":return v+E(t);case"**/*":return`(?:${v}${E(t)}${o})?${y}${l}${S}`;case"**/*.*":return`(?:${v}${E(t)}${o})?${y}${S}${s}${l}${S}`;case"**/.*":return`(?:${v}${E(t)}${o})?${s}${l}${S}`;default:{let F=/^(.*?)\.(\w+)$/.exec(T);if(!F)return;let Y=O(F[1]);return Y?Y+s+F[2]:void 0}}},B=Me.removePrefix(r,k),N=O(B);return N&&t.strictSlashes!==!0&&(N+=`${o}?`),N};tg.exports=wl});var ng=x((u6,ig)=>{u();"use strict";var n2=(et(),Ur),s2=Zm(),vl=rg(),xl=Ii(),a2=Pi(),o2=r=>r&&typeof r=="object"&&!Array.isArray(r),de=(r,e,t=!1)=>{if(Array.isArray(r)){let f=r.map(p=>de(p,e,t));return p=>{for(let h of f){let b=h(p);if(b)return b}return!1}}let i=o2(r)&&r.tokens&&r.input;if(r===""||typeof r!="string"&&!i)throw new TypeError("Expected pattern to be a non-empty string");let n=e||{},a=xl.isWindows(e),s=i?de.compileRe(r,e):de.makeRe(r,e,!1,!0),o=s.state;delete s.state;let l=()=>!1;if(n.ignore){let f={...e,ignore:null,onMatch:null,onResult:null};l=de(n.ignore,f,t)}let c=(f,d=!1)=>{let{isMatch:p,match:h,output:b}=de.test(f,s,e,{glob:r,posix:a}),v={glob:r,state:o,regex:s,posix:a,input:f,output:b,match:h,isMatch:p};return typeof n.onResult=="function"&&n.onResult(v),p===!1?(v.isMatch=!1,d?v:!1):l(f)?(typeof n.onIgnore=="function"&&n.onIgnore(v),v.isMatch=!1,d?v:!1):(typeof n.onMatch=="function"&&n.onMatch(v),d?v:!0)};return t&&(c.state=o),c};de.test=(r,e,t,{glob:i,posix:n}={})=>{if(typeof r!="string")throw new TypeError("Expected input to be a string");if(r==="")return{isMatch:!1,output:""};let a=t||{},s=a.format||(n?xl.toPosixSlashes:null),o=r===i,l=o&&s?s(r):r;return o===!1&&(l=s?s(r):r,o=l===i),(o===!1||a.capture===!0)&&(a.matchBase===!0||a.basename===!0?o=de.matchBase(r,e,t,n):o=e.exec(l)),{isMatch:Boolean(o),match:o,output:l}};de.matchBase=(r,e,t,i=xl.isWindows(t))=>(e instanceof RegExp?e:de.makeRe(e,t)).test(n2.basename(r));de.isMatch=(r,e,t)=>de(e,t)(r);de.parse=(r,e)=>Array.isArray(r)?r.map(t=>de.parse(t,e)):vl(r,{...e,fastpaths:!1});de.scan=(r,e)=>s2(r,e);de.compileRe=(r,e,t=!1,i=!1)=>{if(t===!0)return r.output;let n=e||{},a=n.contains?"":"^",s=n.contains?"":"$",o=`${a}(?:${r.output})${s}`;r&&r.negated===!0&&(o=`^(?!${o}).*$`);let l=de.toRegex(o,e);return i===!0&&(l.state=r),l};de.makeRe=(r,e={},t=!1,i=!1)=>{if(!r||typeof r!="string")throw new TypeError("Expected a non-empty string");let n={negated:!1,fastpaths:!0};return e.fastpaths!==!1&&(r[0]==="."||r[0]==="*")&&(n.output=vl.fastpaths(r,e)),n.output||(n=vl(r,e)),de.compileRe(n,e,t,i)};de.toRegex=(r,e)=>{try{let t=e||{};return new RegExp(r,t.flags||(t.nocase?"i":""))}catch(t){if(e&&e.debug===!0)throw t;return/$^/}};de.constants=a2;ig.exports=de});var ag=x((f6,sg)=>{u();"use strict";sg.exports=ng()});var cg=x((c6,fg)=>{u();"use strict";var og=(Bn(),Nn),lg=Fm(),ot=ag(),kl=Ii(),ug=r=>r===""||r==="./",fe=(r,e,t)=>{e=[].concat(e),r=[].concat(r);let i=new Set,n=new Set,a=new Set,s=0,o=f=>{a.add(f.output),t&&t.onResult&&t.onResult(f)};for(let f=0;f!i.has(f));if(t&&c.length===0){if(t.failglob===!0)throw new Error(`No matches found for "${e.join(", ")}"`);if(t.nonull===!0||t.nullglob===!0)return t.unescape?e.map(f=>f.replace(/\\/g,"")):e}return c};fe.match=fe;fe.matcher=(r,e)=>ot(r,e);fe.isMatch=(r,e,t)=>ot(e,t)(r);fe.any=fe.isMatch;fe.not=(r,e,t={})=>{e=[].concat(e).map(String);let i=new Set,n=[],a=o=>{t.onResult&&t.onResult(o),n.push(o.output)},s=new Set(fe(r,e,{...t,onResult:a}));for(let o of n)s.has(o)||i.add(o);return[...i]};fe.contains=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);if(Array.isArray(e))return e.some(i=>fe.contains(r,i,t));if(typeof e=="string"){if(ug(r)||ug(e))return!1;if(r.includes(e)||r.startsWith("./")&&r.slice(2).includes(e))return!0}return fe.isMatch(r,e,{...t,contains:!0})};fe.matchKeys=(r,e,t)=>{if(!kl.isObject(r))throw new TypeError("Expected the first argument to be an object");let i=fe(Object.keys(r),e,t),n={};for(let a of i)n[a]=r[a];return n};fe.some=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(i.some(s=>a(s)))return!0}return!1};fe.every=(r,e,t)=>{let i=[].concat(r);for(let n of[].concat(e)){let a=ot(String(n),t);if(!i.every(s=>a(s)))return!1}return!0};fe.all=(r,e,t)=>{if(typeof r!="string")throw new TypeError(`Expected a string: "${og.inspect(r)}"`);return[].concat(e).every(i=>ot(i,t)(r))};fe.capture=(r,e,t)=>{let i=kl.isWindows(t),a=ot.makeRe(String(r),{...t,capture:!0}).exec(i?kl.toPosixSlashes(e):e);if(a)return a.slice(1).map(s=>s===void 0?"":s)};fe.makeRe=(...r)=>ot.makeRe(...r);fe.scan=(...r)=>ot.scan(...r);fe.parse=(r,e)=>{let t=[];for(let i of[].concat(r||[]))for(let n of lg(String(i),e))t.push(ot.parse(n,e));return t};fe.braces=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return e&&e.nobrace===!0||!/\{.*\}/.test(r)?[r]:lg(r,e)};fe.braceExpand=(r,e)=>{if(typeof r!="string")throw new TypeError("Expected a string");return fe.braces(r,{...e,expand:!0})};fg.exports=fe});function dg(r,e){let t=e.content.files;t=t.filter(o=>typeof o=="string"),t=t.map(al);let i=cs.generateTasks(t),n=[],a=[];for(let o of i)n.push(...o.positive.map(l=>hg(l,!1))),a.push(...o.negative.map(l=>hg(l,!0)));let s=[...n,...a];return s=u2(r,s),s=s.flatMap(f2),s=s.map(l2),s}function hg(r,e){let t={original:r,base:r,ignore:e,pattern:r,glob:null};return Kh(r)&&Object.assign(t,rm(r)),t}function l2(r){let e=al(r.base);return e=cs.escapePath(e),r.pattern=r.glob?`${e}/${r.glob}`:e,r.pattern=r.ignore?`!${r.pattern}`:r.pattern,r}function u2(r,e){let t=[];return r.userConfigPath&&r.tailwindConfig.content.relative&&(t=[me.dirname(r.userConfigPath)]),e.map(i=>(i.base=me.resolve(...t,i.base),i))}function f2(r){let e=[r];try{let t=be.realpathSync(r.base);t!==r.base&&e.push({...r,base:t})}catch{}return e}function mg(r,e,t){let i=r.tailwindConfig.content.files.filter(s=>typeof s.raw=="string").map(({raw:s,extension:o="html"})=>({content:s,extension:o})),[n,a]=p2(e,t);for(let s of n){let o=me.extname(s).slice(1);i.push({file:s,extension:o})}return[i,a]}function c2(r){if(!r.some(a=>a.includes("**")&&!yg.test(a)))return()=>{};let t=[],i=[];for(let a of r){let s=pg.default.matcher(a);yg.test(a)&&i.push(s),t.push(s)}let n=!1;return a=>{if(n||i.some(f=>f(a)))return;let s=t.findIndex(f=>f(a));if(s===-1)return;let o=r[s],l=me.relative(m.cwd(),o);l[0]!=="."&&(l=`./${l}`);let c=gg.find(f=>a.includes(f));c&&(n=!0,G.warn("broad-content-glob-pattern",[`Your \`content\` configuration includes a pattern which looks like it's accidentally matching all of \`${c}\` and can cause serious performance issues.`,`Pattern: \`${l}\``,"See our documentation for recommendations:","https://tailwindcss.com/docs/content-configuration#pattern-recommendations"]))}}function p2(r,e){let t=r.map(o=>o.pattern),i=new Map,n=c2(t),a=new Set;Je.DEBUG&&console.time("Finding changed files");let s=cs.sync(t,{absolute:!0});for(let o of s){n(o);let l=e.get(o)||-1/0,c=be.statSync(o).mtimeMs;c>l&&(a.add(o),i.set(o,c))}return Je.DEBUG&&console.timeEnd("Finding changed files"),[a,i]}var pg,gg,yg,bg=R(()=>{u();ft();et();Xh();Jh();Zh();im();It();Be();pg=pe(cg());gg=["node_modules"],yg=new RegExp(`(${gg.map(r=>String.raw`\b${r}\b`).join("|")})`)});function wg(){}var vg=R(()=>{u()});function g2(r,e){for(let t of e){let i=`${r}${t}`;if(be.existsSync(i)&&be.statSync(i).isFile())return i}for(let t of e){let i=`${r}/index${t}`;if(be.existsSync(i))return i}return null}function*xg(r,e,t,i=me.extname(r)){let n=g2(me.resolve(e,r),d2.includes(i)?h2:m2);if(n===null||t.has(n))return;t.add(n),yield n,e=me.dirname(n),i=me.extname(n);let a=be.readFileSync(n,"utf-8");for(let s of[...a.matchAll(/import[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/import[\s\S]*from[\s\S]*?['"](.{3,}?)['"]/gi),...a.matchAll(/require\(['"`](.+)['"`]\)/gi)])!s[1].startsWith(".")||(yield*xg(s[1],e,t,i))}function Sl(r){return r===null?new Set:new Set(xg(r,me.dirname(r),new Set))}var d2,h2,m2,kg=R(()=>{u();ft();et();d2=[".js",".cjs",".mjs"],h2=["",".js",".cjs",".mjs",".ts",".cts",".mts",".jsx",".tsx"],m2=["",".ts",".cts",".mts",".tsx",".js",".cjs",".mjs",".jsx"]});function y2(r,e){if(Al.has(r))return Al.get(r);let t=dg(r,e);return Al.set(r,t).get(r)}function b2(r){let e=na(r);if(e!==null){let[i,n,a,s]=Ag.get(e)||[],o=Sl(e),l=!1,c=new Map;for(let p of o){let h=be.statSync(p).mtimeMs;c.set(p,h),(!s||!s.has(p)||h>s.get(p))&&(l=!0)}if(!l)return[i,e,n,a];for(let p of o)delete pf.cache[p];let f=sl(zr(wg(e))),d=Vi(f);return Ag.set(e,[f,d,o,c]),[f,e,d,o]}let t=zr(r?.config??r??{});return t=sl(t),[t,null,Vi(t),[]]}function Cl(r){return({tailwindDirectives:e,registerDependency:t})=>(i,n)=>{let[a,s,o,l]=b2(r),c=new Set(l);if(e.size>0){c.add(n.opts.from);for(let b of n.messages)b.type==="dependency"&&c.add(b.file)}let[f,,d]=zh(i,n,a,s,o,c),p=fs(f),h=y2(f,a);if(e.size>0){for(let y of h)for(let w of rl(y))t(w);let[b,v]=mg(f,h,p);for(let y of b)f.changedContent.push(y);for(let[y,w]of v.entries())d.set(y,w)}for(let b of l)t({type:"dependency",file:b});for(let[b,v]of d.entries())p.set(b,v);return f}}var Sg,Ag,Al,Cg=R(()=>{u();ft();Sg=pe(Ns());yf();ia();sc();_i();Uh();Yh();bg();vg();kg();Ag=new Sg.default({maxSize:100}),Al=new WeakMap});function _l(r){let e=new Set,t=new Set,i=new Set;if(r.walkAtRules(n=>{n.name==="apply"&&i.add(n),n.name==="import"&&(n.params==='"tailwindcss/base"'||n.params==="'tailwindcss/base'"?(n.name="tailwind",n.params="base"):n.params==='"tailwindcss/components"'||n.params==="'tailwindcss/components'"?(n.name="tailwind",n.params="components"):n.params==='"tailwindcss/utilities"'||n.params==="'tailwindcss/utilities'"?(n.name="tailwind",n.params="utilities"):(n.params==='"tailwindcss/screens"'||n.params==="'tailwindcss/screens'"||n.params==='"tailwindcss/variants"'||n.params==="'tailwindcss/variants'")&&(n.name="tailwind",n.params="variants")),n.name==="tailwind"&&(n.params==="screens"&&(n.params="variants"),e.add(n.params)),["layer","responsive","variants"].includes(n.name)&&(["responsive","variants"].includes(n.name)&&G.warn(`${n.name}-at-rule-deprecated`,[`The \`@${n.name}\` directive has been deprecated in Tailwind CSS v3.0.`,"Use `@layer utilities` or `@layer components` instead.","https://tailwindcss.com/docs/upgrade-guide#replace-variants-with-layer"]),t.add(n))}),!e.has("base")||!e.has("components")||!e.has("utilities")){for(let n of t)if(n.name==="layer"&&["base","components","utilities"].includes(n.params)){if(!e.has(n.params))throw n.error(`\`@layer ${n.params}\` is used but no matching \`@tailwind ${n.params}\` directive is present.`)}else if(n.name==="responsive"){if(!e.has("utilities"))throw n.error("`@responsive` is used but `@tailwind utilities` is missing.")}else if(n.name==="variants"&&!e.has("utilities"))throw n.error("`@variants` is used but `@tailwind utilities` is missing.")}return{tailwindDirectives:e,applyDirectives:i}}var _g=R(()=>{u();Be()});function Qt(r,e=void 0,t=void 0){return r.map(i=>{let n=i.clone();return t!==void 0&&(n.raws.tailwind={...n.raws.tailwind,...t}),e!==void 0&&Eg(n,a=>{if(a.raws.tailwind?.preserveSource===!0&&a.source)return!1;a.source=e}),n})}function Eg(r,e){e(r)!==!1&&r.each?.(t=>Eg(t,e))}var Og=R(()=>{u()});function El(r){return r=Array.isArray(r)?r:[r],r=r.map(e=>e instanceof RegExp?e.source:e),r.join("")}function Ne(r){return new RegExp(El(r),"g")}function qt(r){return`(?:${r.map(El).join("|")})`}function Ol(r){return`(?:${El(r)})?`}function Rg(r){return r&&w2.test(r)?r.replace(Tg,"\\$&"):r||""}var Tg,w2,Pg=R(()=>{u();Tg=/[\\^$.*+?()[\]{}|]/g,w2=RegExp(Tg.source)});function Ig(r){let e=Array.from(v2(r));return t=>{let i=[];for(let n of e)for(let a of t.match(n)??[])i.push(S2(a));for(let n of i.slice()){let a=ve(n,".");for(let s=0;s=a.length-1){i.push(o);continue}let l=Number(a[s+1]);isNaN(l)?i.push(o):s++}}return i}}function*v2(r){let e=r.tailwindConfig.separator,t=r.tailwindConfig.prefix!==""?Ol(Ne([/-?/,Rg(r.tailwindConfig.prefix)])):"",i=qt([/\[[^\s:'"`]+:[^\s\[\]]+\]/,/\[[^\s:'"`\]]+:[^\s]+?\[[^\s]+\][^\s]+?\]/,Ne([qt([/-?(?:\w+)/,/@(?:\w+)/]),Ol(qt([Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s:\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\><$]*)?/]),Ne([qt([/-(?:\w+-)*\['[^\s]+'\]/,/-(?:\w+-)*\["[^\s]+"\]/,/-(?:\w+-)*\[`[^\s]+`\]/,/-(?:\w+-)*\[(?:[^\s\[\]]+\[[^\s\[\]]+\])*[^\s\[\]]+\]/]),/(?![{([]])/,/(?:\/[^\s'"`\\$]*)?/]),/[-\/][^\s'"`\\$={><]*/]))])]),n=[qt([Ne([/@\[[^\s"'`]+\](\/[^\s"'`]+)?/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s"'`]+\]/,e]),Ne([/[^\s"'`\[\\]+/,e])]),qt([Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]\/[\w_-]+/,e]),Ne([/([^\s"'`\[\\]+-)?\[[^\s`]+\]/,e]),Ne([/[^\s`\[\\]+/,e])])];for(let a of n)yield Ne(["((?=((",a,")+))\\2)?",/!?/,t,i]);yield/[^<>"'`\s.(){}[\]#=%$][^<>"'`\s(){}[\]#=%$]*[^<>"'`\s.(){}[\]#=%:$]/g}function S2(r){if(!r.includes("-["))return r;let e=0,t=[],i=r.matchAll(x2);i=Array.from(i).flatMap(n=>{let[,...a]=n;return a.map((s,o)=>Object.assign([],n,{index:n.index+o,0:s}))});for(let n of i){let a=n[0],s=t[t.length-1];if(a===s?t.pop():(a==="'"||a==='"'||a==="`")&&t.push(a),!s){if(a==="["){e++;continue}else if(a==="]"){e--;continue}if(e<0)return r.substring(0,n.index-1);if(e===0&&!k2.test(a))return r.substring(0,n.index)}}return r}var x2,k2,Dg=R(()=>{u();Pg();zt();x2=/([\[\]'"`])([^\[\]'"`])?/g,k2=/[^"'`\s<>\]]+/});function A2(r,e){let t=r.tailwindConfig.content.extract;return t[e]||t.DEFAULT||$g[e]||$g.DEFAULT(r)}function C2(r,e){let t=r.content.transform;return t[e]||t.DEFAULT||Lg[e]||Lg.DEFAULT}function _2(r,e,t,i){qi.has(e)||qi.set(e,new qg.default({maxSize:25e3}));for(let n of r.split(` +`))if(n=n.trim(),!i.has(n))if(i.add(n),qi.get(e).has(n))for(let a of qi.get(e).get(n))t.add(a);else{let a=e(n).filter(o=>o!=="!*"),s=new Set(a);for(let o of s)t.add(o);qi.get(e).set(n,s)}}function E2(r,e){let t=e.offsets.sort(r),i={base:new Set,defaults:new Set,components:new Set,utilities:new Set,variants:new Set};for(let[n,a]of t)i[n.layer].add(a);return i}function Tl(r){return async e=>{let t={base:null,components:null,utilities:null,variants:null};if(e.walkAtRules(y=>{y.name==="tailwind"&&Object.keys(t).includes(y.params)&&(t[y.params]=y)}),Object.values(t).every(y=>y===null))return e;let i=new Set([...r.candidates??[],gt]),n=new Set;bt.DEBUG&&console.time("Reading changed files");let a=[];for(let y of r.changedContent){let w=C2(r.tailwindConfig,y.extension),k=A2(r,y.extension);a.push([y,{transformer:w,extractor:k}])}let s=500;for(let y=0;y{S=k?await be.promises.readFile(k,"utf8"):S,_2(E(S),O,i,n)}))}bt.DEBUG&&console.timeEnd("Reading changed files");let o=r.classCache.size;bt.DEBUG&&console.time("Generate rules"),bt.DEBUG&&console.time("Sorting candidates");let l=new Set([...i].sort((y,w)=>y===w?0:y{let w=y.raws.tailwind?.parentLayer;return w==="components"?t.components!==null:w==="utilities"?t.utilities!==null:!0});t.variants?(t.variants.before(Qt(b,t.variants.source,{layer:"variants"})),t.variants.remove()):b.length>0&&e.append(Qt(b,e.source,{layer:"variants"})),e.source.end=e.source.end??e.source.start;let v=b.some(y=>y.raws.tailwind?.parentLayer==="utilities");t.utilities&&p.size===0&&!v&&G.warn("content-problems",["No utility classes were detected in your source files. If this is unexpected, double-check the `content` option in your Tailwind CSS configuration.","https://tailwindcss.com/docs/content-configuration"]),bt.DEBUG&&(console.log("Potential classes: ",i.size),console.log("Active contexts: ",Zn.size)),r.changedContent=[],e.walkAtRules("layer",y=>{Object.keys(t).includes(y.params)&&y.remove()})}}var qg,bt,$g,Lg,qi,Mg=R(()=>{u();ft();qg=pe(Ns());It();as();Be();Og();Dg();bt=Je,$g={DEFAULT:Ig},Lg={DEFAULT:r=>r,svelte:r=>r.replace(/(?:^|\s)class:/g," ")};qi=new WeakMap});function ws(r){let e=new Map;ee.root({nodes:[r.clone()]}).walkRules(a=>{(0,bs.default)(s=>{s.walkClasses(o=>{let l=o.parent.toString(),c=e.get(l);c||e.set(l,c=new Set),c.add(o.value)})}).processSync(a.selector)});let i=Array.from(e.values(),a=>Array.from(a)),n=i.flat();return Object.assign(n,{groups:i})}function Rl(r){return O2.astSync(r)}function Ng(r,e){let t=new Set;for(let i of r)t.add(i.split(e).pop());return Array.from(t)}function Bg(r,e){let t=r.tailwindConfig.prefix;return typeof t=="function"?t(e):t+e}function*Fg(r){for(yield r;r.parent;)yield r.parent,r=r.parent}function T2(r,e={}){let t=r.nodes;r.nodes=[];let i=r.clone(e);return r.nodes=t,i}function R2(r){for(let e of Fg(r))if(r!==e){if(e.type==="root")break;r=T2(e,{nodes:[r]})}return r}function P2(r,e){let t=new Map;return r.walkRules(i=>{for(let s of Fg(i))if(s.raws.tailwind?.layer!==void 0)return;let n=R2(i),a=e.offsets.create("user");for(let s of ws(i)){let o=t.get(s)||[];t.set(s,o),o.push([{layer:"user",sort:a,important:!1},n])}}),t}function I2(r,e){for(let t of r){if(e.notClassCache.has(t)||e.applyClassCache.has(t))continue;if(e.classCache.has(t)){e.applyClassCache.set(t,e.classCache.get(t).map(([n,a])=>[n,a.clone()]));continue}let i=Array.from(Go(t,e));if(i.length===0){e.notClassCache.add(t);continue}e.applyClassCache.set(t,i)}return e.applyClassCache}function D2(r){let e=null;return{get:t=>(e=e||r(),e.get(t)),has:t=>(e=e||r(),e.has(t))}}function q2(r){return{get:e=>r.flatMap(t=>t.get(e)||[]),has:e=>r.some(t=>t.has(e))}}function jg(r){let e=r.split(/[\s\t\n]+/g);return e[e.length-1]==="!important"?[e.slice(0,-1),!0]:[e,!1]}function zg(r,e,t){let i=new Set,n=[];if(r.walkAtRules("apply",l=>{let[c]=jg(l.params);for(let f of c)i.add(f);n.push(l)}),n.length===0)return;let a=q2([t,I2(i,e)]);function s(l,c,f){let d=Rl(l),p=Rl(c),b=Rl(`.${Te(f)}`).nodes[0].nodes[0];return d.each(v=>{let y=new Set;p.each(w=>{let k=!1;w=w.clone(),w.walkClasses(S=>{S.value===b.value&&(k||(S.replaceWith(...v.nodes.map(E=>E.clone())),y.add(w),k=!0))})});for(let w of y){let k=[[]];for(let S of w.nodes)S.type==="combinator"?(k.push(S),k.push([])):k[k.length-1].push(S);w.nodes=[];for(let S of k)Array.isArray(S)&&S.sort((E,O)=>E.type==="tag"&&O.type==="class"?-1:E.type==="class"&&O.type==="tag"?1:E.type==="class"&&O.type==="pseudo"&&O.value.startsWith("::")?-1:E.type==="pseudo"&&E.value.startsWith("::")&&O.type==="class"?1:0),w.nodes=w.nodes.concat(S)}v.replaceWith(...y)}),d.toString()}let o=new Map;for(let l of n){let[c]=o.get(l.parent)||[[],l.source];o.set(l.parent,[c,l.source]);let[f,d]=jg(l.params);if(l.parent.type==="atrule"){if(l.parent.name==="screen"){let p=l.parent.params;throw l.error(`@apply is not supported within nested at-rules like @screen. We suggest you write this as @apply ${f.map(h=>`${p}:${h}`).join(" ")} instead.`)}throw l.error(`@apply is not supported within nested at-rules like @${l.parent.name}. You can fix this by un-nesting @${l.parent.name}.`)}for(let p of f){if([Bg(e,"group"),Bg(e,"peer")].includes(p))throw l.error(`@apply should not be used with the '${p}' utility`);if(!a.has(p))throw l.error(`The \`${p}\` class does not exist. If \`${p}\` is a custom class, make sure it is defined within a \`@layer\` directive.`);let h=a.get(p);for(let[,b]of h)b.type!=="atrule"&&b.walkRules(()=>{throw l.error([`The \`${p}\` class cannot be used with \`@apply\` because \`@apply\` does not currently support nested CSS.`,"Rewrite the selector without nesting or configure the `tailwindcss/nesting` plugin:","https://tailwindcss.com/docs/using-with-preprocessors#nesting"].join(` +`))});c.push([p,d,h])}}for(let[l,[c,f]]of o){let d=[];for(let[h,b,v]of c){let y=[h,...Ng([h],e.tailwindConfig.separator)];for(let[w,k]of v){let S=ws(l),E=ws(k);if(E=E.groups.filter(T=>T.some(F=>y.includes(F))).flat(),E=E.concat(Ng(E,e.tailwindConfig.separator)),S.some(T=>E.includes(T)))throw k.error(`You cannot \`@apply\` the \`${h}\` utility here because it creates a circular dependency.`);let B=ee.root({nodes:[k.clone()]});B.walk(T=>{T.source=f}),(k.type!=="atrule"||k.type==="atrule"&&k.name!=="keyframes")&&B.walkRules(T=>{if(!ws(T).some(U=>U===h)){T.remove();return}let F=typeof e.tailwindConfig.important=="string"?e.tailwindConfig.important:null,_=l.raws.tailwind!==void 0&&F&&l.selector.indexOf(F)===0?l.selector.slice(F.length):l.selector;_===""&&(_=l.selector),T.selector=s(_,T.selector,h),F&&_!==l.selector&&(T.selector=rs(T.selector,F)),T.walkDecls(U=>{U.important=w.important||b});let Q=(0,bs.default)().astSync(T.selector);Q.each(U=>pr(U)),T.selector=Q.toString()}),!!B.nodes[0]&&d.push([w.sort,B.nodes[0]])}}let p=e.offsets.sort(d).map(h=>h[1]);l.after(p)}for(let l of n)l.parent.nodes.length>1?l.remove():l.parent.remove();zg(r,e,t)}function Pl(r){return e=>{let t=D2(()=>P2(e,r));zg(e,r,t)}}var bs,O2,Ug=R(()=>{u();Ot();bs=pe(it());as();fr();Vo();es();O2=(0,bs.default)()});var Vg=x((rq,vs)=>{u();(function(){"use strict";function r(i,n,a){if(!i)return null;r.caseSensitive||(i=i.toLowerCase());var s=r.threshold===null?null:r.threshold*i.length,o=r.thresholdAbsolute,l;s!==null&&o!==null?l=Math.min(s,o):s!==null?l=s:o!==null?l=o:l=null;var c,f,d,p,h,b=n.length;for(h=0;ha)return a+1;var l=[],c,f,d,p,h;for(c=0;c<=o;c++)l[c]=[c];for(f=0;f<=s;f++)l[0][f]=f;for(c=1;c<=o;c++){for(d=e,p=1,c>a&&(p=c-a),h=o+1,h>a+c&&(h=a+c),f=1;f<=s;f++)fh?l[c][f]=a+1:n.charAt(c-1)===i.charAt(f-1)?l[c][f]=l[c-1][f-1]:l[c][f]=Math.min(l[c-1][f-1]+1,Math.min(l[c][f-1]+1,l[c-1][f]+1)),l[c][f]a)return a+1}return l[o][s]}})()});var Wg=x((iq,Hg)=>{u();var Il="(".charCodeAt(0),Dl=")".charCodeAt(0),xs="'".charCodeAt(0),ql='"'.charCodeAt(0),$l="\\".charCodeAt(0),yr="/".charCodeAt(0),Ll=",".charCodeAt(0),Ml=":".charCodeAt(0),ks="*".charCodeAt(0),$2="u".charCodeAt(0),L2="U".charCodeAt(0),M2="+".charCodeAt(0),N2=/^[a-f0-9?-]+$/i;Hg.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d{u();Gg.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n{u();function Yg(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Kg(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Kg(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Yg(r[i],e)+t;return t}return Yg(r,e)}Xg.exports=Kg});var ey=x((aq,Zg)=>{u();var Ss="-".charCodeAt(0),As="+".charCodeAt(0),Nl=".".charCodeAt(0),B2="e".charCodeAt(0),F2="E".charCodeAt(0);function j2(r){var e=r.charCodeAt(0),t;if(e===As||e===Ss){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===Nl&&i>=48&&i<=57}return e===Nl?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Zg.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!j2(r))return!1;for(i=r.charCodeAt(e),(i===As||i===Ss)&&e++;e57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===Nl&&n>=48&&n<=57)for(e+=2;e57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===B2||i===F2)&&(n>=48&&n<=57||(n===As||n===Ss)&&a>=48&&a<=57))for(e+=n===As||n===Ss?3:2;e57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var ny=x((oq,iy)=>{u();var z2=Wg(),ty=Qg(),ry=Jg();function $t(r){return this instanceof $t?(this.nodes=z2(r),this):new $t(r)}$t.prototype.toString=function(){return Array.isArray(this.nodes)?ry(this.nodes):""};$t.prototype.walk=function(r,e){return ty(this.nodes,r,e),this};$t.unit=ey();$t.walk=ty;$t.stringify=ry;iy.exports=$t});function Fl(r){return typeof r=="object"&&r!==null}function U2(r,e){let t=kt(e);do if(t.pop(),(0,$i.default)(r,t)!==void 0)break;while(t.length);return t.length?t:void 0}function br(r){return typeof r=="string"?r:r.reduce((e,t,i)=>t.includes(".")?`${e}[${t}]`:i===0?t:`${e}.${t}`,"")}function ay(r){return r.map(e=>`'${e}'`).join(", ")}function oy(r){return ay(Object.keys(r))}function jl(r,e,t,i={}){let n=Array.isArray(e)?br(e):e.replace(/^['"]+|['"]+$/g,""),a=Array.isArray(e)?e:kt(n),s=(0,$i.default)(r.theme,a,t);if(s===void 0){let l=`'${n}' does not exist in your theme config.`,c=a.slice(0,-1),f=(0,$i.default)(r.theme,c);if(Fl(f)){let d=Object.keys(f).filter(h=>jl(r,[...c,h]).isValid),p=(0,sy.default)(a[a.length-1],d);p?l+=` Did you mean '${br([...c,p])}'?`:d.length>0&&(l+=` '${br(c)}' has the following valid keys: ${ay(d)}`)}else{let d=U2(r.theme,n);if(d){let p=(0,$i.default)(r.theme,d);Fl(p)?l+=` '${br(d)}' has the following keys: ${oy(p)}`:l+=` '${br(d)}' is not an object.`}else l+=` Your theme has the following top-level keys: ${oy(r.theme)}`}return{isValid:!1,error:l}}if(!(typeof s=="string"||typeof s=="number"||typeof s=="function"||s instanceof String||s instanceof Number||Array.isArray(s))){let l=`'${n}' was found but does not resolve to a string.`;if(Fl(s)){let c=Object.keys(s).filter(f=>jl(r,[...a,f]).isValid);c.length&&(l+=` Did you mean something like '${br([...a,c[0]])}'?`)}return{isValid:!1,error:l}}let[o]=a;return{isValid:!0,value:mt(o)(s,i)}}function V2(r,e,t){e=e.map(n=>ly(r,n,t));let i=[""];for(let n of e)n.type==="div"&&n.value===","?i.push(""):i[i.length-1]+=Bl.default.stringify(n);return i}function ly(r,e,t){if(e.type==="function"&&t[e.value]!==void 0){let i=V2(r,e.nodes,t);e.type="word",e.value=t[e.value](r,...i)}return e}function H2(r,e,t){return Object.keys(t).some(n=>e.includes(`${n}(`))?(0,Bl.default)(e).walk(n=>{ly(r,n,t)}).toString():e}function*G2(r){r=r.replace(/^['"]+|['"]+$/g,"");let e=r.match(/^([^\s]+)(?![^\[]*\])(?:\s*\/\s*([^\/\s]+))$/),t;yield[r,void 0],e&&(r=e[1],t=e[2],yield[r,t])}function Q2(r,e,t){let i=Array.from(G2(e)).map(([n,a])=>Object.assign(jl(r,n,t,{opacityValue:a}),{resolvedPath:n,alpha:a}));return i.find(n=>n.isValid)??i[0]}function uy(r){let e=r.tailwindConfig,t={theme:(i,n,...a)=>{let{isValid:s,value:o,error:l,alpha:c}=Q2(e,n,a.length?a:void 0);if(!s){let p=i.parent,h=p?.raws.tailwind?.candidate;if(p&&h!==void 0){r.markInvalidUtilityNode(p),p.remove(),G.warn("invalid-theme-key-in-class",[`The utility \`${h}\` contains an invalid theme value and was not generated.`]);return}throw i.error(l)}let f=Xt(o),d=f!==void 0&&typeof f=="function";return(c!==void 0||d)&&(c===void 0&&(c=1),o=Ze(f,c,f)),o},screen:(i,n)=>{n=n.replace(/^['"]+/g,"").replace(/['"]+$/g,"");let s=Rt(e.theme.screens).find(({name:o})=>o===n);if(!s)throw i.error(`The '${n}' screen does not exist in your theme.`);return Tt(s)}};return i=>{i.walk(n=>{let a=W2[n.type];a!==void 0&&(n[a]=H2(n,n[a],t))})}}var $i,sy,Bl,W2,fy=R(()=>{u();$i=pe(Oa()),sy=pe(Vg());Si();Bl=pe(ny());Xn();Qn();Gi();Lr();Fr();Be();W2={atrule:"params",decl:"value"}});function cy({tailwindConfig:{theme:r}}){return function(e){e.walkAtRules("screen",t=>{let i=t.params,a=Rt(r.screens).find(({name:s})=>s===i);if(!a)throw t.error(`No \`${i}\` screen found.`);t.name="media",t.params=Tt(a)})}}var py=R(()=>{u();Xn();Qn()});function Y2(r){let e=r.filter(o=>o.type!=="pseudo"||o.nodes.length>0?!0:o.value.startsWith("::")||[":before",":after",":first-line",":first-letter"].includes(o.value)).reverse(),t=new Set(["tag","class","id","attribute"]),i=e.findIndex(o=>t.has(o.type));if(i===-1)return e.reverse().join("").trim();let n=e[i],a=dy[n.type]?dy[n.type](n):n;e=e.slice(0,i);let s=e.findIndex(o=>o.type==="combinator"&&o.value===">");return s!==-1&&(e.splice(0,s),e.unshift(Cs.default.universal())),[a,...e.reverse()].join("").trim()}function X2(r){return zl.has(r)||zl.set(r,K2.transformSync(r)),zl.get(r)}function Ul({tailwindConfig:r}){return e=>{let t=new Map,i=new Set;if(e.walkAtRules("defaults",n=>{if(n.nodes&&n.nodes.length>0){i.add(n);return}let a=n.params;t.has(a)||t.set(a,new Set),t.get(a).add(n.parent),n.remove()}),we(r,"optimizeUniversalDefaults"))for(let n of i){let a=new Map,s=t.get(n.params)??[];for(let o of s)for(let l of X2(o.selector)){let c=l.includes(":-")||l.includes("::-")||l.includes(":has")?l:"__DEFAULT__",f=a.get(c)??new Set;a.set(c,f),f.add(l)}if(a.size===0){n.remove();continue}for(let[,o]of a){let l=ee.rule({source:n.source});l.selectors=[...o],l.append(n.nodes.map(c=>c.clone())),n.before(l)}n.remove()}else if(i.size){let n=ee.rule({selectors:["*","::before","::after"]});for(let s of i)n.append(s.nodes),n.parent||s.before(n),n.source||(n.source=s.source),s.remove();let a=n.clone({selectors:["::backdrop"]});n.after(a)}}}var Cs,dy,K2,zl,hy=R(()=>{u();Ot();Cs=pe(it());ct();dy={id(r){return Cs.default.attribute({attribute:"id",operator:"=",value:r.value,quoteMark:'"'})}};K2=(0,Cs.default)(r=>r.map(e=>{let t=e.split(i=>i.type==="combinator"&&i.value===" ").pop();return Y2(t)})),zl=new Map});function Vl(){function r(e){let t=null;e.each(i=>{if(!J2.has(i.type)){t=null;return}if(t===null){t=i;return}let n=my[i.type];i.type==="atrule"&&i.name==="font-face"?t=i:n.every(a=>(i[a]??"").replace(/\s+/g," ")===(t[a]??"").replace(/\s+/g," "))?(i.nodes&&t.append(i.nodes),i.remove()):t=i}),e.each(i=>{i.type==="atrule"&&r(i)})}return e=>{r(e)}}var my,J2,gy=R(()=>{u();my={atrule:["name","params"],rule:["selector"]},J2=new Set(Object.keys(my))});function Hl(){return r=>{r.walkRules(e=>{let t=new Map,i=new Set([]),n=new Map;e.walkDecls(a=>{if(a.parent===e){if(t.has(a.prop)){if(t.get(a.prop).value===a.value){i.add(t.get(a.prop)),t.set(a.prop,a);return}n.has(a.prop)||n.set(a.prop,new Set),n.get(a.prop).add(t.get(a.prop)),n.get(a.prop).add(a)}t.set(a.prop,a)}});for(let a of i)a.remove();for(let a of n.values()){let s=new Map;for(let o of a){let l=eO(o.value);l!==null&&(s.has(l)||s.set(l,new Set),s.get(l).add(o))}for(let o of s.values()){let l=Array.from(o).slice(0,-1);for(let c of l)c.remove()}}})}}function eO(r){let e=/^-?\d*.?\d+([\w%]+)?$/g.exec(r);return e?e[1]??Z2:null}var Z2,yy=R(()=>{u();Z2=Symbol("unitless-number")});function tO(r){if(!r.walkAtRules)return;let e=new Set;if(r.walkAtRules("apply",t=>{e.add(t.parent)}),e.size!==0)for(let t of e){let i=[],n=[];for(let a of t.nodes)a.type==="atrule"&&a.name==="apply"?(n.length>0&&(i.push(n),n=[]),i.push([a])):n.push(a);if(n.length>0&&i.push(n),i.length!==1){for(let a of[...i].reverse()){let s=t.clone({nodes:[]});s.append(a),t.after(s)}t.remove()}}}function _s(){return r=>{tO(r)}}var by=R(()=>{u()});function Es(r){return async function(e,t){let{tailwindDirectives:i,applyDirectives:n}=_l(e);_s()(e,t);let a=r({tailwindDirectives:i,applyDirectives:n,registerDependency(s){t.messages.push({plugin:"tailwindcss",parent:t.opts.from,...s})},createContext(s,o){return tl(s,o,e)}})(e,t);if(a.tailwindConfig.separator==="-")throw new Error("The '-' character cannot be used as a custom separator in JIT mode due to parsing ambiguity. Please use another character like '_' instead.");Of(a.tailwindConfig),await Tl(a)(e,t),_s()(e,t),Pl(a)(e,t),uy(a)(e,t),cy(a)(e,t),Ul(a)(e,t),Vl(a)(e,t),Hl(a)(e,t)}}var wy=R(()=>{u();_g();Mg();Ug();fy();py();hy();gy();yy();by();_i();ct()});function vy(r,e){let t=null,i=null;return r.walkAtRules("config",n=>{if(i=n.source?.input.file??e.opts.from??null,i===null)throw n.error("The `@config` directive cannot be used without setting `from` in your PostCSS config.");if(t)throw n.error("Only one `@config` directive is allowed per file.");let a=n.params.match(/(['"])(.*?)\1/);if(!a)throw n.error("A path is required when using the `@config` directive.");let s=a[2];if(me.isAbsolute(s))throw n.error("The `@config` directive cannot be used with an absolute path.");if(t=me.resolve(me.dirname(i),s),!be.existsSync(t))throw n.error(`The config file at "${s}" does not exist. Make sure the path is correct and the file exists.`);n.remove()}),t||null}var xy=R(()=>{u();ft();et()});var ky=x((Vq,Wl)=>{u();Cg();wy();It();xy();Wl.exports=function(e){return{postcssPlugin:"tailwindcss",plugins:[Je.DEBUG&&function(t){return console.log(` +`),console.time("JIT TOTAL"),t},async function(t,i){e=vy(t,i)??e;let n=Cl(e);if(t.type==="document"){let a=t.nodes.filter(s=>s.type==="root");for(let s of a)s.type==="root"&&await Es(n)(s,i);return}await Es(n)(t,i)},Je.DEBUG&&function(t){return console.timeEnd("JIT TOTAL"),console.log(` +`),t}].filter(Boolean)}};Wl.exports.postcss=!0});var Ay=x((Hq,Sy)=>{u();Sy.exports=ky()});var Gl=x((Wq,Cy)=>{u();Cy.exports=()=>["and_chr 114","and_uc 15.5","chrome 114","chrome 113","chrome 109","edge 114","firefox 114","ios_saf 16.5","ios_saf 16.4","ios_saf 16.3","ios_saf 16.1","opera 99","safari 16.5","samsung 21"]});var Os={};Ge(Os,{agents:()=>rO,feature:()=>iO});function iO(){return{status:"cr",title:"CSS Feature Queries",stats:{ie:{"6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","5.5":"n"},edge:{"12":"y","13":"y","14":"y","15":"y","16":"y","17":"y","18":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y"},firefox:{"2":"n","3":"n","4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y","3.5":"n","3.6":"n"},chrome:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"n","10":"n","11":"n","12":"n","13":"n","14":"n","15":"n","16":"n","17":"n","18":"n","19":"n","20":"n","21":"n","22":"n","23":"n","24":"n","25":"n","26":"n","27":"n","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","59":"y","60":"y","61":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","101":"y","102":"y","103":"y","104":"y","105":"y","106":"y","107":"y","108":"y","109":"y","110":"y","111":"y","112":"y","113":"y","114":"y","115":"y","116":"y","117":"y"},safari:{"4":"n","5":"n","6":"n","7":"n","8":"n","9":"y","10":"y","11":"y","12":"y","13":"y","14":"y","15":"y","17":"y","9.1":"y","10.1":"y","11.1":"y","12.1":"y","13.1":"y","14.1":"y","15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y",TP:"y","3.1":"n","3.2":"n","5.1":"n","6.1":"n","7.1":"n"},opera:{"9":"n","11":"n","12":"n","15":"y","16":"y","17":"y","18":"y","19":"y","20":"y","21":"y","22":"y","23":"y","24":"y","25":"y","26":"y","27":"y","28":"y","29":"y","30":"y","31":"y","32":"y","33":"y","34":"y","35":"y","36":"y","37":"y","38":"y","39":"y","40":"y","41":"y","42":"y","43":"y","44":"y","45":"y","46":"y","47":"y","48":"y","49":"y","50":"y","51":"y","52":"y","53":"y","54":"y","55":"y","56":"y","57":"y","58":"y","60":"y","62":"y","63":"y","64":"y","65":"y","66":"y","67":"y","68":"y","69":"y","70":"y","71":"y","72":"y","73":"y","74":"y","75":"y","76":"y","77":"y","78":"y","79":"y","80":"y","81":"y","82":"y","83":"y","84":"y","85":"y","86":"y","87":"y","88":"y","89":"y","90":"y","91":"y","92":"y","93":"y","94":"y","95":"y","96":"y","97":"y","98":"y","99":"y","100":"y","12.1":"y","9.5-9.6":"n","10.0-10.1":"n","10.5":"n","10.6":"n","11.1":"n","11.5":"n","11.6":"n"},ios_saf:{"8":"n","17":"y","9.0-9.2":"y","9.3":"y","10.0-10.2":"y","10.3":"y","11.0-11.2":"y","11.3-11.4":"y","12.0-12.1":"y","12.2-12.5":"y","13.0-13.1":"y","13.2":"y","13.3":"y","13.4-13.7":"y","14.0-14.4":"y","14.5-14.8":"y","15.0-15.1":"y","15.2-15.3":"y","15.4":"y","15.5":"y","15.6":"y","16.0":"y","16.1":"y","16.2":"y","16.3":"y","16.4":"y","16.5":"y","16.6":"y","3.2":"n","4.0-4.1":"n","4.2-4.3":"n","5.0-5.1":"n","6.0-6.1":"n","7.0-7.1":"n","8.1-8.4":"n"},op_mini:{all:"y"},android:{"3":"n","4":"n","114":"y","4.4":"y","4.4.3-4.4.4":"y","2.1":"n","2.2":"n","2.3":"n","4.1":"n","4.2-4.3":"n"},bb:{"7":"n","10":"n"},op_mob:{"10":"n","11":"n","12":"n","73":"y","11.1":"n","11.5":"n","12.1":"n"},and_chr:{"114":"y"},and_ff:{"115":"y"},ie_mob:{"10":"n","11":"n"},and_uc:{"15.5":"y"},samsung:{"4":"y","20":"y","21":"y","5.0-5.4":"y","6.2-6.4":"y","7.2-7.4":"y","8.2":"y","9.2":"y","10.1":"y","11.1-11.2":"y","12.0":"y","13.0":"y","14.0":"y","15.0":"y","16.0":"y","17.0":"y","18.0":"y","19.0":"y"},and_qq:{"13.1":"y"},baidu:{"13.18":"y"},kaios:{"2.5":"y","3.0-3.1":"y"}}}}var rO,Ts=R(()=>{u();rO={ie:{prefix:"ms"},edge:{prefix:"webkit",prefix_exceptions:{"12":"ms","13":"ms","14":"ms","15":"ms","16":"ms","17":"ms","18":"ms"}},firefox:{prefix:"moz"},chrome:{prefix:"webkit"},safari:{prefix:"webkit"},opera:{prefix:"webkit",prefix_exceptions:{"9":"o","11":"o","12":"o","9.5-9.6":"o","10.0-10.1":"o","10.5":"o","10.6":"o","11.1":"o","11.5":"o","11.6":"o","12.1":"o"}},ios_saf:{prefix:"webkit"},op_mini:{prefix:"o"},android:{prefix:"webkit"},bb:{prefix:"webkit"},op_mob:{prefix:"o",prefix_exceptions:{"73":"webkit"}},and_chr:{prefix:"webkit"},and_ff:{prefix:"moz"},ie_mob:{prefix:"ms"},and_uc:{prefix:"webkit",prefix_exceptions:{"15.5":"webkit"}},samsung:{prefix:"webkit"},and_qq:{prefix:"webkit"},baidu:{prefix:"webkit"},kaios:{prefix:"moz"}}});var _y=x(()=>{u()});var _e=x((Yq,Lt)=>{u();var{list:Ql}=$e();Lt.exports.error=function(r){let e=new Error(r);throw e.autoprefixer=!0,e};Lt.exports.uniq=function(r){return[...new Set(r)]};Lt.exports.removeNote=function(r){return r.includes(" ")?r.split(" ")[0]:r};Lt.exports.escapeRegexp=function(r){return r.replace(/[$()*+-.?[\\\]^{|}]/g,"\\$&")};Lt.exports.regexp=function(r,e=!0){return e&&(r=this.escapeRegexp(r)),new RegExp(`(^|[\\s,(])(${r}($|[\\s(,]))`,"gi")};Lt.exports.editList=function(r,e){let t=Ql.comma(r),i=e(t,[]);if(t===i)return r;let n=r.match(/,\s*/);return n=n?n[0]:", ",i.join(n)};Lt.exports.splitSelector=function(r){return Ql.comma(r).map(e=>Ql.space(e).map(t=>t.split(/(?=\.|#)/g)))}});var Mt=x((Kq,Ty)=>{u();var nO=Gl(),Ey=(Ts(),Os).agents,sO=_e(),Oy=class{static prefixes(){if(this.prefixesCache)return this.prefixesCache;this.prefixesCache=[];for(let e in Ey)this.prefixesCache.push(`-${Ey[e].prefix}-`);return this.prefixesCache=sO.uniq(this.prefixesCache).sort((e,t)=>t.length-e.length),this.prefixesCache}static withPrefix(e){return this.prefixesRegexp||(this.prefixesRegexp=new RegExp(this.prefixes().join("|"))),this.prefixesRegexp.test(e)}constructor(e,t,i,n){this.data=e,this.options=i||{},this.browserslistOpts=n||{},this.selected=this.parse(t)}parse(e){let t={};for(let i in this.browserslistOpts)t[i]=this.browserslistOpts[i];return t.path=this.options.from,nO(e,t)}prefix(e){let[t,i]=e.split(" "),n=this.data[t],a=n.prefix_exceptions&&n.prefix_exceptions[i];return a||(a=n.prefix),`-${a}-`}isSelected(e){return this.selected.includes(e)}};Ty.exports=Oy});var Li=x((Xq,Ry)=>{u();Ry.exports={prefix(r){let e=r.match(/^(-\w+-)/);return e?e[0]:""},unprefixed(r){return r.replace(/^-\w+-/,"")}}});var wr=x((Jq,Iy)=>{u();var aO=Mt(),Py=Li(),oO=_e();function Yl(r,e){let t=new r.constructor;for(let i of Object.keys(r||{})){let n=r[i];i==="parent"&&typeof n=="object"?e&&(t[i]=e):i==="source"||i===null?t[i]=n:Array.isArray(n)?t[i]=n.map(a=>Yl(a,t)):i!=="_autoprefixerPrefix"&&i!=="_autoprefixerValues"&&i!=="proxyCache"&&(typeof n=="object"&&n!==null&&(n=Yl(n,t)),t[i]=n)}return t}var Rs=class{static hack(e){return this.hacks||(this.hacks={}),e.names.map(t=>(this.hacks[t]=e,this.hacks[t]))}static load(e,t,i){let n=this.hacks&&this.hacks[e];return n?new n(e,t,i):new this(e,t,i)}static clone(e,t){let i=Yl(e);for(let n in t)i[n]=t[n];return i}constructor(e,t,i){this.prefixes=t,this.name=e,this.all=i}parentPrefix(e){let t;return typeof e._autoprefixerPrefix!="undefined"?t=e._autoprefixerPrefix:e.type==="decl"&&e.prop[0]==="-"?t=Py.prefix(e.prop):e.type==="root"?t=!1:e.type==="rule"&&e.selector.includes(":-")&&/:(-\w+-)/.test(e.selector)?t=e.selector.match(/:(-\w+-)/)[1]:e.type==="atrule"&&e.name[0]==="-"?t=Py.prefix(e.name):t=this.parentPrefix(e.parent),aO.prefixes().includes(t)||(t=!1),e._autoprefixerPrefix=t,e._autoprefixerPrefix}process(e,t){if(!this.check(e))return;let i=this.parentPrefix(e),n=this.prefixes.filter(s=>!i||i===oO.removeNote(s)),a=[];for(let s of n)this.add(e,s,a.concat([s]),t)&&a.push(s);return a}clone(e,t){return Rs.clone(e,t)}};Iy.exports=Rs});var j=x((Zq,$y)=>{u();var lO=wr(),uO=Mt(),Dy=_e(),qy=class extends lO{check(){return!0}prefixed(e,t){return t+e}normalize(e){return e}otherPrefixes(e,t){for(let i of uO.prefixes())if(i!==t&&e.includes(i))return!0;return!1}set(e,t){return e.prop=this.prefixed(e.prop,t),e}needCascade(e){return e._autoprefixerCascade||(e._autoprefixerCascade=this.all.options.cascade!==!1&&e.raw("before").includes(` +`)),e._autoprefixerCascade}maxPrefixed(e,t){if(t._autoprefixerMax)return t._autoprefixerMax;let i=0;for(let n of e)n=Dy.removeNote(n),n.length>i&&(i=n.length);return t._autoprefixerMax=i,t._autoprefixerMax}calcBefore(e,t,i=""){let a=this.maxPrefixed(e,t)-Dy.removeNote(i).length,s=t.raw("before");return a>0&&(s+=Array(a).fill(" ").join("")),s}restoreBefore(e){let t=e.raw("before").split(` +`),i=t[t.length-1];this.all.group(e).up(n=>{let a=n.raw("before").split(` +`),s=a[a.length-1];s.lengths.prop===n.prop&&s.value===n.value)))return this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,n)}isAlready(e,t){let i=this.all.group(e).up(n=>n.prop===t);return i||(i=this.all.group(e).down(n=>n.prop===t)),i}add(e,t,i,n){let a=this.prefixed(e.prop,t);if(!(this.isAlready(e,a)||this.otherPrefixes(e.value,t)))return this.insert(e,t,i,n)}process(e,t){if(!this.needCascade(e)){super.process(e,t);return}let i=super.process(e,t);!i||!i.length||(this.restoreBefore(e),e.raws.before=this.calcBefore(i,e))}old(e,t){return[this.prefixed(e,t)]}};$y.exports=qy});var My=x((e$,Ly)=>{u();Ly.exports=function r(e){return{mul:t=>new r(e*t),div:t=>new r(e/t),simplify:()=>new r(e),toString:()=>e.toString()}}});var Fy=x((t$,By)=>{u();var fO=My(),cO=wr(),Kl=_e(),pO=/(min|max)-resolution\s*:\s*\d*\.?\d+(dppx|dpcm|dpi|x)/gi,dO=/(min|max)-resolution(\s*:\s*)(\d*\.?\d+)(dppx|dpcm|dpi|x)/i,Ny=class extends cO{prefixName(e,t){return e==="-moz-"?t+"--moz-device-pixel-ratio":e+t+"-device-pixel-ratio"}prefixQuery(e,t,i,n,a){return n=new fO(n),a==="dpi"?n=n.div(96):a==="dpcm"&&(n=n.mul(2.54).div(96)),n=n.simplify(),e==="-o-"&&(n=n.n+"/"+n.d),this.prefixName(e,t)+i+n}clean(e){if(!this.bad){this.bad=[];for(let t of this.prefixes)this.bad.push(this.prefixName(t,"min")),this.bad.push(this.prefixName(t,"max"))}e.params=Kl.editList(e.params,t=>t.filter(i=>this.bad.every(n=>!i.includes(n))))}process(e){let t=this.parentPrefix(e),i=t?[t]:this.prefixes;e.params=Kl.editList(e.params,(n,a)=>{for(let s of n){if(!s.includes("min-resolution")&&!s.includes("max-resolution")){a.push(s);continue}for(let o of i){let l=s.replace(pO,c=>{let f=c.match(dO);return this.prefixQuery(o,f[1],f[2],f[3],f[4])});a.push(l)}a.push(s)}return Kl.uniq(a)})}};By.exports=Ny});var zy=x((r$,jy)=>{u();var Xl="(".charCodeAt(0),Jl=")".charCodeAt(0),Ps="'".charCodeAt(0),Zl='"'.charCodeAt(0),eu="\\".charCodeAt(0),vr="/".charCodeAt(0),tu=",".charCodeAt(0),ru=":".charCodeAt(0),Is="*".charCodeAt(0),hO="u".charCodeAt(0),mO="U".charCodeAt(0),gO="+".charCodeAt(0),yO=/^[a-f0-9?-]+$/i;jy.exports=function(r){for(var e=[],t=r,i,n,a,s,o,l,c,f,d=0,p=t.charCodeAt(d),h=t.length,b=[{nodes:e}],v=0,y,w="",k="",S="";d{u();Uy.exports=function r(e,t,i){var n,a,s,o;for(n=0,a=e.length;n{u();function Hy(r,e){var t=r.type,i=r.value,n,a;return e&&(a=e(r))!==void 0?a:t==="word"||t==="space"?i:t==="string"?(n=r.quote||"",n+i+(r.unclosed?"":n)):t==="comment"?"/*"+i+(r.unclosed?"":"*/"):t==="div"?(r.before||"")+i+(r.after||""):Array.isArray(r.nodes)?(n=Wy(r.nodes,e),t!=="function"?n:i+"("+(r.before||"")+n+(r.after||"")+(r.unclosed?"":")")):i}function Wy(r,e){var t,i;if(Array.isArray(r)){for(t="",i=r.length-1;~i;i-=1)t=Hy(r[i],e)+t;return t}return Hy(r,e)}Gy.exports=Wy});var Ky=x((s$,Yy)=>{u();var Ds="-".charCodeAt(0),qs="+".charCodeAt(0),iu=".".charCodeAt(0),bO="e".charCodeAt(0),wO="E".charCodeAt(0);function vO(r){var e=r.charCodeAt(0),t;if(e===qs||e===Ds){if(t=r.charCodeAt(1),t>=48&&t<=57)return!0;var i=r.charCodeAt(2);return t===iu&&i>=48&&i<=57}return e===iu?(t=r.charCodeAt(1),t>=48&&t<=57):e>=48&&e<=57}Yy.exports=function(r){var e=0,t=r.length,i,n,a;if(t===0||!vO(r))return!1;for(i=r.charCodeAt(e),(i===qs||i===Ds)&&e++;e57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),i===iu&&n>=48&&n<=57)for(e+=2;e57));)e+=1;if(i=r.charCodeAt(e),n=r.charCodeAt(e+1),a=r.charCodeAt(e+2),(i===bO||i===wO)&&(n>=48&&n<=57||(n===qs||n===Ds)&&a>=48&&a<=57))for(e+=n===qs||n===Ds?3:2;e57));)e+=1;return{number:r.slice(0,e),unit:r.slice(e)}}});var $s=x((a$,Zy)=>{u();var xO=zy(),Xy=Vy(),Jy=Qy();function Nt(r){return this instanceof Nt?(this.nodes=xO(r),this):new Nt(r)}Nt.prototype.toString=function(){return Array.isArray(this.nodes)?Jy(this.nodes):""};Nt.prototype.walk=function(r,e){return Xy(this.nodes,r,e),this};Nt.unit=Ky();Nt.walk=Xy;Nt.stringify=Jy;Zy.exports=Nt});var nb=x((o$,ib)=>{u();var{list:kO}=$e(),eb=$s(),SO=Mt(),tb=Li(),rb=class{constructor(e){this.props=["transition","transition-property"],this.prefixes=e}add(e,t){let i,n,a=this.prefixes.add[e.prop],s=this.ruleVendorPrefixes(e),o=s||a&&a.prefixes||[],l=this.parse(e.value),c=l.map(h=>this.findProp(h)),f=[];if(c.some(h=>h[0]==="-"))return;for(let h of l){if(n=this.findProp(h),n[0]==="-")continue;let b=this.prefixes.add[n];if(!(!b||!b.prefixes))for(i of b.prefixes){if(s&&!s.some(y=>i.includes(y)))continue;let v=this.prefixes.prefixed(n,i);v!=="-ms-transform"&&!c.includes(v)&&(this.disabled(n,i)||f.push(this.clone(n,v,h)))}}l=l.concat(f);let d=this.stringify(l),p=this.stringify(this.cleanFromUnprefixed(l,"-webkit-"));if(o.includes("-webkit-")&&this.cloneBefore(e,`-webkit-${e.prop}`,p),this.cloneBefore(e,e.prop,p),o.includes("-o-")){let h=this.stringify(this.cleanFromUnprefixed(l,"-o-"));this.cloneBefore(e,`-o-${e.prop}`,h)}for(i of o)if(i!=="-webkit-"&&i!=="-o-"){let h=this.stringify(this.cleanOtherPrefixes(l,i));this.cloneBefore(e,i+e.prop,h)}d!==e.value&&!this.already(e,e.prop,d)&&(this.checkForWarning(t,e),e.cloneBefore(),e.value=d)}findProp(e){let t=e[0].value;if(/^\d/.test(t)){for(let[i,n]of e.entries())if(i!==0&&n.type==="word")return n.value}return t}already(e,t,i){return e.parent.some(n=>n.prop===t&&n.value===i)}cloneBefore(e,t,i){this.already(e,t,i)||e.cloneBefore({prop:t,value:i})}checkForWarning(e,t){if(t.prop!=="transition-property")return;let i=!1,n=!1;t.parent.each(a=>{if(a.type!=="decl"||a.prop.indexOf("transition-")!==0)return;let s=kO.comma(a.value);if(a.prop==="transition-property"){s.forEach(o=>{let l=this.prefixes.add[o];l&&l.prefixes&&l.prefixes.length>0&&(i=!0)});return}return n=n||s.length>1,!1}),i&&n&&t.warn(e,"Replace transition-property to transition, because Autoprefixer could not support any cases of transition-property and other transition-*")}remove(e){let t=this.parse(e.value);t=t.filter(s=>{let o=this.prefixes.remove[this.findProp(s)];return!o||!o.remove});let i=this.stringify(t);if(e.value===i)return;if(t.length===0){e.remove();return}let n=e.parent.some(s=>s.prop===e.prop&&s.value===i),a=e.parent.some(s=>s!==e&&s.prop===e.prop&&s.value.length>i.length);if(n||a){e.remove();return}e.value=i}parse(e){let t=eb(e),i=[],n=[];for(let a of t.nodes)n.push(a),a.type==="div"&&a.value===","&&(i.push(n),n=[]);return i.push(n),i.filter(a=>a.length>0)}stringify(e){if(e.length===0)return"";let t=[];for(let i of e)i[i.length-1].type!=="div"&&i.push(this.div(e)),t=t.concat(i);return t[0].type==="div"&&(t=t.slice(1)),t[t.length-1].type==="div"&&(t=t.slice(0,-2+1||void 0)),eb.stringify({nodes:t})}clone(e,t,i){let n=[],a=!1;for(let s of i)!a&&s.type==="word"&&s.value===e?(n.push({type:"word",value:t}),a=!0):n.push(s);return n}div(e){for(let t of e)for(let i of t)if(i.type==="div"&&i.value===",")return i;return{type:"div",value:",",after:" "}}cleanOtherPrefixes(e,t){return e.filter(i=>{let n=tb.prefix(this.findProp(i));return n===""||n===t})}cleanFromUnprefixed(e,t){let i=e.map(a=>this.findProp(a)).filter(a=>a.slice(0,t.length)===t).map(a=>this.prefixes.unprefixed(a)),n=[];for(let a of e){let s=this.findProp(a),o=tb.prefix(s);!i.includes(s)&&(o===t||o==="")&&n.push(a)}return n}disabled(e,t){let i=["order","justify-content","align-self","align-content"];if(e.includes("flex")||i.includes(e)){if(this.prefixes.options.flexbox===!1)return!0;if(this.prefixes.options.flexbox==="no-2009")return t.includes("2009")}}ruleVendorPrefixes(e){let{parent:t}=e;if(t.type!=="rule")return!1;if(!t.selector.includes(":-"))return!1;let i=SO.prefixes().filter(n=>t.selector.includes(":"+n));return i.length>0?i:!1}};ib.exports=rb});var xr=x((l$,ab)=>{u();var AO=_e(),sb=class{constructor(e,t,i,n){this.unprefixed=e,this.prefixed=t,this.string=i||t,this.regexp=n||AO.regexp(t)}check(e){return e.includes(this.string)?!!e.match(this.regexp):!1}};ab.exports=sb});var He=x((u$,lb)=>{u();var CO=wr(),_O=xr(),EO=Li(),OO=_e(),ob=class extends CO{static save(e,t){let i=t.prop,n=[];for(let a in t._autoprefixerValues){let s=t._autoprefixerValues[a];if(s===t.value)continue;let o,l=EO.prefix(i);if(l==="-pie-")continue;if(l===a){o=t.value=s,n.push(o);continue}let c=e.prefixed(i,a),f=t.parent;if(!f.every(b=>b.prop!==c)){n.push(o);continue}let d=s.replace(/\s+/," ");if(f.some(b=>b.prop===t.prop&&b.value.replace(/\s+/," ")===d)){n.push(o);continue}let h=this.clone(t,{value:s});o=t.parent.insertBefore(t,h),n.push(o)}return n}check(e){let t=e.value;return t.includes(this.name)?!!t.match(this.regexp()):!1}regexp(){return this.regexpCache||(this.regexpCache=OO.regexp(this.name))}replace(e,t){return e.replace(this.regexp(),`$1${t}$2`)}value(e){return e.raws.value&&e.raws.value.value===e.value?e.raws.value.raw:e.value}add(e,t){e._autoprefixerValues||(e._autoprefixerValues={});let i=e._autoprefixerValues[t]||this.value(e),n;do if(n=i,i=this.replace(i,t),i===!1)return;while(i!==n);e._autoprefixerValues[t]=i}old(e){return new _O(this.name,e+this.name)}};lb.exports=ob});var Bt=x((f$,ub)=>{u();ub.exports={}});var su=x((c$,pb)=>{u();var fb=$s(),TO=He(),RO=Bt().insertAreas,PO=/(^|[^-])linear-gradient\(\s*(top|left|right|bottom)/i,IO=/(^|[^-])radial-gradient\(\s*\d+(\w*|%)\s+\d+(\w*|%)\s*,/i,DO=/(!\s*)?autoprefixer:\s*ignore\s+next/i,qO=/(!\s*)?autoprefixer\s*grid:\s*(on|off|(no-)?autoplace)/i,$O=["width","height","min-width","max-width","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size"];function nu(r){return r.parent.some(e=>e.prop==="grid-template"||e.prop==="grid-template-areas")}function LO(r){let e=r.parent.some(i=>i.prop==="grid-template-rows"),t=r.parent.some(i=>i.prop==="grid-template-columns");return e&&t}var cb=class{constructor(e){this.prefixes=e}add(e,t){let i=this.prefixes.add["@resolution"],n=this.prefixes.add["@keyframes"],a=this.prefixes.add["@viewport"],s=this.prefixes.add["@supports"];e.walkAtRules(f=>{if(f.name==="keyframes"){if(!this.disabled(f,t))return n&&n.process(f)}else if(f.name==="viewport"){if(!this.disabled(f,t))return a&&a.process(f)}else if(f.name==="supports"){if(this.prefixes.options.supports!==!1&&!this.disabled(f,t))return s.process(f)}else if(f.name==="media"&&f.params.includes("-resolution")&&!this.disabled(f,t))return i&&i.process(f)}),e.walkRules(f=>{if(!this.disabled(f,t))return this.prefixes.add.selectors.map(d=>d.process(f,t))});function o(f){return f.parent.nodes.some(d=>{if(d.type!=="decl")return!1;let p=d.prop==="display"&&/(inline-)?grid/.test(d.value),h=d.prop.startsWith("grid-template"),b=/^grid-([A-z]+-)?gap/.test(d.prop);return p||h||b})}function l(f){return f.parent.some(d=>d.prop==="display"&&/(inline-)?flex/.test(d.value))}let c=this.gridStatus(e,t)&&this.prefixes.add["grid-area"]&&this.prefixes.add["grid-area"].prefixes;return e.walkDecls(f=>{if(this.disabledDecl(f,t))return;let d=f.parent,p=f.prop,h=f.value;if(p==="grid-row-span"){t.warn("grid-row-span is not part of final Grid Layout. Use grid-row.",{node:f});return}else if(p==="grid-column-span"){t.warn("grid-column-span is not part of final Grid Layout. Use grid-column.",{node:f});return}else if(p==="display"&&h==="box"){t.warn("You should write display: flex by final spec instead of display: box",{node:f});return}else if(p==="text-emphasis-position")(h==="under"||h==="over")&&t.warn("You should use 2 values for text-emphasis-position For example, `under left` instead of just `under`.",{node:f});else if(/^(align|justify|place)-(items|content)$/.test(p)&&l(f))(h==="start"||h==="end")&&t.warn(`${h} value has mixed support, consider using flex-${h} instead`,{node:f});else if(p==="text-decoration-skip"&&h==="ink")t.warn("Replace text-decoration-skip: ink to text-decoration-skip-ink: auto, because spec had been changed",{node:f});else{if(c&&this.gridStatus(f,t))if(f.value==="subgrid"&&t.warn("IE does not support subgrid",{node:f}),/^(align|justify|place)-items$/.test(p)&&o(f)){let v=p.replace("-items","-self");t.warn(`IE does not support ${p} on grid containers. Try using ${v} on child elements instead: ${f.parent.selector} > * { ${v}: ${f.value} }`,{node:f})}else if(/^(align|justify|place)-content$/.test(p)&&o(f))t.warn(`IE does not support ${f.prop} on grid containers`,{node:f});else if(p==="display"&&f.value==="contents"){t.warn("Please do not use display: contents; if you have grid setting enabled",{node:f});return}else if(f.prop==="grid-gap"){let v=this.gridStatus(f,t);v==="autoplace"&&!LO(f)&&!nu(f)?t.warn("grid-gap only works if grid-template(-areas) is being used or both rows and columns have been declared and cells have not been manually placed inside the explicit grid",{node:f}):(v===!0||v==="no-autoplace")&&!nu(f)&&t.warn("grid-gap only works if grid-template(-areas) is being used",{node:f})}else if(p==="grid-auto-columns"){t.warn("grid-auto-columns is not supported by IE",{node:f});return}else if(p==="grid-auto-rows"){t.warn("grid-auto-rows is not supported by IE",{node:f});return}else if(p==="grid-auto-flow"){let v=d.some(w=>w.prop==="grid-template-rows"),y=d.some(w=>w.prop==="grid-template-columns");nu(f)?t.warn("grid-auto-flow is not supported by IE",{node:f}):h.includes("dense")?t.warn("grid-auto-flow: dense is not supported by IE",{node:f}):!v&&!y&&t.warn("grid-auto-flow works only if grid-template-rows and grid-template-columns are present in the same rule",{node:f});return}else if(h.includes("auto-fit")){t.warn("auto-fit value is not supported by IE",{node:f,word:"auto-fit"});return}else if(h.includes("auto-fill")){t.warn("auto-fill value is not supported by IE",{node:f,word:"auto-fill"});return}else p.startsWith("grid-template")&&h.includes("[")&&t.warn("Autoprefixer currently does not support line names. Try using grid-template-areas instead.",{node:f,word:"["});if(h.includes("radial-gradient"))if(IO.test(f.value))t.warn("Gradient has outdated direction syntax. New syntax is like `closest-side at 0 0` instead of `0 0, closest-side`.",{node:f});else{let v=fb(h);for(let y of v.nodes)if(y.type==="function"&&y.value==="radial-gradient")for(let w of y.nodes)w.type==="word"&&(w.value==="cover"?t.warn("Gradient has outdated direction syntax. Replace `cover` to `farthest-corner`.",{node:f}):w.value==="contain"&&t.warn("Gradient has outdated direction syntax. Replace `contain` to `closest-side`.",{node:f}))}h.includes("linear-gradient")&&PO.test(h)&&t.warn("Gradient has outdated direction syntax. New syntax is like `to left` instead of `right`.",{node:f})}$O.includes(f.prop)&&(f.value.includes("-fill-available")||(f.value.includes("fill-available")?t.warn("Replace fill-available to stretch, because spec had been changed",{node:f}):f.value.includes("fill")&&fb(h).nodes.some(y=>y.type==="word"&&y.value==="fill")&&t.warn("Replace fill to stretch, because spec had been changed",{node:f})));let b;if(f.prop==="transition"||f.prop==="transition-property")return this.prefixes.transition.add(f,t);if(f.prop==="align-self"){if(this.displayType(f)!=="grid"&&this.prefixes.options.flexbox!==!1&&(b=this.prefixes.add["align-self"],b&&b.prefixes&&b.process(f)),this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-row-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="justify-self"){if(this.gridStatus(f,t)!==!1&&(b=this.prefixes.add["grid-column-align"],b&&b.prefixes))return b.process(f,t)}else if(f.prop==="place-self"){if(b=this.prefixes.add["place-self"],b&&b.prefixes&&this.gridStatus(f,t)!==!1)return b.process(f,t)}else if(b=this.prefixes.add[f.prop],b&&b.prefixes)return b.process(f,t)}),this.gridStatus(e,t)&&RO(e,this.disabled),e.walkDecls(f=>{if(this.disabledValue(f,t))return;let d=this.prefixes.unprefixed(f.prop),p=this.prefixes.values("add",d);if(Array.isArray(p))for(let h of p)h.process&&h.process(f,t);TO.save(this.prefixes,f)})}remove(e,t){let i=this.prefixes.remove["@resolution"];e.walkAtRules((n,a)=>{this.prefixes.remove[`@${n.name}`]?this.disabled(n,t)||n.parent.removeChild(a):n.name==="media"&&n.params.includes("-resolution")&&i&&i.clean(n)});for(let n of this.prefixes.remove.selectors)e.walkRules((a,s)=>{n.check(a)&&(this.disabled(a,t)||a.parent.removeChild(s))});return e.walkDecls((n,a)=>{if(this.disabled(n,t))return;let s=n.parent,o=this.prefixes.unprefixed(n.prop);if((n.prop==="transition"||n.prop==="transition-property")&&this.prefixes.transition.remove(n),this.prefixes.remove[n.prop]&&this.prefixes.remove[n.prop].remove){let l=this.prefixes.group(n).down(c=>this.prefixes.normalize(c.prop)===o);if(o==="flex-flow"&&(l=!0),n.prop==="-webkit-box-orient"){let c={"flex-direction":!0,"flex-flow":!0};if(!n.parent.some(f=>c[f.prop]))return}if(l&&!this.withHackValue(n)){n.raw("before").includes(` +`)&&this.reduceSpaces(n),s.removeChild(a);return}}for(let l of this.prefixes.values("remove",o)){if(!l.check||!l.check(n.value))continue;if(o=l.unprefixed,this.prefixes.group(n).down(f=>f.value.includes(o))){s.removeChild(a);return}}})}withHackValue(e){return e.prop==="-webkit-background-clip"&&e.value==="text"}disabledValue(e,t){return this.gridStatus(e,t)===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("grid")||this.prefixes.options.flexbox===!1&&e.type==="decl"&&e.prop==="display"&&e.value.includes("flex")||e.type==="decl"&&e.prop==="content"?!0:this.disabled(e,t)}disabledDecl(e,t){if(this.gridStatus(e,t)===!1&&e.type==="decl"&&(e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.prefixes.options.flexbox===!1&&e.type==="decl"){let i=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||i.includes(e.prop))return!0}return this.disabled(e,t)}disabled(e,t){if(!e)return!1;if(e._autoprefixerDisabled!==void 0)return e._autoprefixerDisabled;if(e.parent){let n=e.prev();if(n&&n.type==="comment"&&DO.test(n.text))return e._autoprefixerDisabled=!0,e._autoprefixerSelfDisabled=!0,!0}let i=null;if(e.nodes){let n;e.each(a=>{a.type==="comment"&&/(!\s*)?autoprefixer:\s*(off|on)/i.test(a.text)&&(typeof n!="undefined"?t.warn("Second Autoprefixer control comment was ignored. Autoprefixer applies control comment to whole block, not to next rules.",{node:a}):n=/on/i.test(a.text))}),n!==void 0&&(i=!n)}if(!e.nodes||i===null)if(e.parent){let n=this.disabled(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else i=!1;return e._autoprefixerDisabled=i,i}reduceSpaces(e){let t=!1;if(this.prefixes.group(e).up(()=>(t=!0,!0)),t)return;let i=e.raw("before").split(` +`),n=i[i.length-1].length,a=!1;this.prefixes.group(e).down(s=>{i=s.raw("before").split(` +`);let o=i.length-1;i[o].length>n&&(a===!1&&(a=i[o].length-n),i[o]=i[o].slice(0,-a),s.raws.before=i.join(` +`))})}displayType(e){for(let t of e.parent.nodes)if(t.prop==="display"){if(t.value.includes("flex"))return"flex";if(t.value.includes("grid"))return"grid"}return!1}gridStatus(e,t){if(!e)return!1;if(e._autoprefixerGridStatus!==void 0)return e._autoprefixerGridStatus;let i=null;if(e.nodes){let n;e.each(a=>{if(a.type==="comment"&&qO.test(a.text)){let s=/:\s*autoplace/i.test(a.text),o=/no-autoplace/i.test(a.text);typeof n!="undefined"?t.warn("Second Autoprefixer grid control comment was ignored. Autoprefixer applies control comments to the whole block, not to the next rules.",{node:a}):s?n="autoplace":o?n=!0:n=/on/i.test(a.text)}}),n!==void 0&&(i=n)}if(e.type==="atrule"&&e.name==="supports"){let n=e.params;n.includes("grid")&&n.includes("auto")&&(i=!1)}if(!e.nodes||i===null)if(e.parent){let n=this.gridStatus(e.parent,t);e.parent._autoprefixerSelfDisabled===!0?i=!1:i=n}else typeof this.prefixes.options.grid!="undefined"?i=this.prefixes.options.grid:typeof m.env.AUTOPREFIXER_GRID!="undefined"?m.env.AUTOPREFIXER_GRID==="autoplace"?i="autoplace":i=!0:i=!1;return e._autoprefixerGridStatus=i,i}};pb.exports=cb});var hb=x((p$,db)=>{u();db.exports={A:{A:{"2":"K E F G A B JC"},B:{"1":"C L M H N D O P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I"},C:{"1":"2 3 4 5 6 7 8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 KC zB J K E F G A B C L M H N D O k l LC MC"},D:{"1":"8 9 AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB 0B dB 1B eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R S T U V W X Y Z a b c d e f g h i j n o p q r s t u v w x y z I uB 3B 4B","2":"0 1 2 3 4 5 6 7 J K E F G A B C L M H N D O k l"},E:{"1":"G A B C L M H D RC 6B vB wB 7B SC TC 8B 9B xB AC yB BC CC DC EC FC GC UC","2":"0 J K E F NC 5B OC PC QC"},F:{"1":"1 2 3 4 5 6 7 8 9 H N D O k l AB BB CB DB EB FB GB HB IB JB KB LB MB NB OB PB QB RB SB TB UB VB WB XB YB ZB aB bB cB dB eB fB gB hB iB jB kB lB mB nB oB m pB qB rB sB tB P Q R 2B S T U V W X Y Z a b c d e f g h i j wB","2":"G B C VC WC XC YC vB HC ZC"},G:{"1":"D fC gC hC iC jC kC lC mC nC oC pC qC rC sC tC 8B 9B xB AC yB BC CC DC EC FC GC","2":"F 5B aC IC bC cC dC eC"},H:{"1":"uC"},I:{"1":"I zC 0C","2":"zB J vC wC xC yC IC"},J:{"2":"E A"},K:{"1":"m","2":"A B C vB HC wB"},L:{"1":"I"},M:{"1":"uB"},N:{"2":"A B"},O:{"1":"xB"},P:{"1":"J k l 1C 2C 3C 4C 5C 6B 6C 7C 8C 9C AD yB BD CD DD"},Q:{"1":"7B"},R:{"1":"ED"},S:{"1":"FD GD"}},B:4,C:"CSS Feature Queries"}});var bb=x((d$,yb)=>{u();function mb(r){return r[r.length-1]}var gb={parse(r){let e=[""],t=[e];for(let i of r){if(i==="("){e=[""],mb(t).push(e),t.push(e);continue}if(i===")"){t.pop(),e=mb(t),e.push("");continue}e[e.length-1]+=i}return t[0]},stringify(r){let e="";for(let t of r){if(typeof t=="object"){e+=`(${gb.stringify(t)})`;continue}e+=t}return e}};yb.exports=gb});var Sb=x((h$,kb)=>{u();var MO=hb(),{feature:NO}=(Ts(),Os),{parse:BO}=$e(),FO=Mt(),au=bb(),jO=He(),zO=_e(),wb=NO(MO),vb=[];for(let r in wb.stats){let e=wb.stats[r];for(let t in e){let i=e[t];/y/.test(i)&&vb.push(r+" "+t)}}var xb=class{constructor(e,t){this.Prefixes=e,this.all=t}prefixer(){if(this.prefixerCache)return this.prefixerCache;let e=this.all.browsers.selected.filter(i=>vb.includes(i)),t=new FO(this.all.browsers.data,e,this.all.options);return this.prefixerCache=new this.Prefixes(this.all.data,t,this.all.options),this.prefixerCache}parse(e){let t=e.split(":"),i=t[0],n=t[1];return n||(n=""),[i.trim(),n.trim()]}virtual(e){let[t,i]=this.parse(e),n=BO("a{}").first;return n.append({prop:t,value:i,raws:{before:""}}),n}prefixed(e){let t=this.virtual(e);if(this.disabled(t.first))return t.nodes;let i={warn:()=>null},n=this.prefixer().add[t.first.prop];n&&n.process&&n.process(t.first,i);for(let a of t.nodes){for(let s of this.prefixer().values("add",t.first.prop))s.process(a);jO.save(this.all,a)}return t.nodes}isNot(e){return typeof e=="string"&&/not\s*/i.test(e)}isOr(e){return typeof e=="string"&&/\s*or\s*/i.test(e)}isProp(e){return typeof e=="object"&&e.length===1&&typeof e[0]=="string"}isHack(e,t){return!new RegExp(`(\\(|\\s)${zO.escapeRegexp(t)}:`).test(e)}toRemove(e,t){let[i,n]=this.parse(e),a=this.all.unprefixed(i),s=this.all.cleaner();if(s.remove[i]&&s.remove[i].remove&&!this.isHack(t,a))return!0;for(let o of s.values("remove",a))if(o.check(n))return!0;return!1}remove(e,t){let i=0;for(;itypeof t!="object"?t:t.length===1&&typeof t[0]=="object"?this.cleanBrackets(t[0]):this.cleanBrackets(t))}convert(e){let t=[""];for(let i of e)t.push([`${i.prop}: ${i.value}`]),t.push(" or ");return t[t.length-1]="",t}normalize(e){if(typeof e!="object")return e;if(e=e.filter(t=>t!==""),typeof e[0]=="string"){let t=e[0].trim();if(t.includes(":")||t==="selector"||t==="not selector")return[au.stringify(e)]}return e.map(t=>this.normalize(t))}add(e,t){return e.map(i=>{if(this.isProp(i)){let n=this.prefixed(i[0]);return n.length>1?this.convert(n):i}return typeof i=="object"?this.add(i,t):i})}process(e){let t=au.parse(e.params);t=this.normalize(t),t=this.remove(t,e.params),t=this.add(t,e.params),t=this.cleanBrackets(t),e.params=au.stringify(t)}disabled(e){if(!this.all.options.grid&&(e.prop==="display"&&e.value.includes("grid")||e.prop.includes("grid")||e.prop==="justify-items"))return!0;if(this.all.options.flexbox===!1){if(e.prop==="display"&&e.value.includes("flex"))return!0;let t=["order","justify-content","align-items","align-content"];if(e.prop.includes("flex")||t.includes(e.prop))return!0}return!1}};kb.exports=xb});var _b=x((m$,Cb)=>{u();var Ab=class{constructor(e,t){this.prefix=t,this.prefixed=e.prefixed(this.prefix),this.regexp=e.regexp(this.prefix),this.prefixeds=e.possible().map(i=>[e.prefixed(i),e.regexp(i)]),this.unprefixed=e.name,this.nameRegexp=e.regexp()}isHack(e){let t=e.parent.index(e)+1,i=e.parent.nodes;for(;t{u();var{list:UO}=$e(),VO=_b(),HO=wr(),WO=Mt(),GO=_e(),Eb=class extends HO{constructor(e,t,i){super(e,t,i);this.regexpCache=new Map}check(e){return e.selector.includes(this.name)?!!e.selector.match(this.regexp()):!1}prefixed(e){return this.name.replace(/^(\W*)/,`$1${e}`)}regexp(e){if(!this.regexpCache.has(e)){let t=e?this.prefixed(e):this.name;this.regexpCache.set(e,new RegExp(`(^|[^:"'=])${GO.escapeRegexp(t)}`,"gi"))}return this.regexpCache.get(e)}possible(){return WO.prefixes()}prefixeds(e){if(e._autoprefixerPrefixeds){if(e._autoprefixerPrefixeds[this.name])return e._autoprefixerPrefixeds}else e._autoprefixerPrefixeds={};let t={};if(e.selector.includes(",")){let n=UO.comma(e.selector).filter(a=>a.includes(this.name));for(let a of this.possible())t[a]=n.map(s=>this.replace(s,a)).join(", ")}else for(let i of this.possible())t[i]=this.replace(e.selector,i);return e._autoprefixerPrefixeds[this.name]=t,e._autoprefixerPrefixeds}already(e,t,i){let n=e.parent.index(e)-1;for(;n>=0;){let a=e.parent.nodes[n];if(a.type!=="rule")return!1;let s=!1;for(let o in t[this.name]){let l=t[this.name][o];if(a.selector===l){if(i===o)return!0;s=!0;break}}if(!s)return!1;n-=1}return!1}replace(e,t){return e.replace(this.regexp(),`$1${this.prefixed(t)}`)}add(e,t){let i=this.prefixeds(e);if(this.already(e,i,t))return;let n=this.clone(e,{selector:i[this.name][t]});e.parent.insertBefore(e,n)}old(e){return new VO(this,e)}};Ob.exports=Eb});var Pb=x((y$,Rb)=>{u();var QO=wr(),Tb=class extends QO{add(e,t){let i=t+e.name;if(e.parent.some(s=>s.name===i&&s.params===e.params))return;let a=this.clone(e,{name:i});return e.parent.insertBefore(e,a)}process(e){let t=this.parentPrefix(e);for(let i of this.prefixes)(!t||t===i)&&this.add(e,i)}};Rb.exports=Tb});var Db=x((b$,Ib)=>{u();var YO=kr(),ou=class extends YO{prefixed(e){return e==="-webkit-"?":-webkit-full-screen":e==="-moz-"?":-moz-full-screen":`:${e}fullscreen`}};ou.names=[":fullscreen"];Ib.exports=ou});var $b=x((w$,qb)=>{u();var KO=kr(),lu=class extends KO{possible(){return super.possible().concat(["-moz- old","-ms- old"])}prefixed(e){return e==="-webkit-"?"::-webkit-input-placeholder":e==="-ms-"?"::-ms-input-placeholder":e==="-ms- old"?":-ms-input-placeholder":e==="-moz- old"?":-moz-placeholder":`::${e}placeholder`}};lu.names=["::placeholder"];qb.exports=lu});var Mb=x((v$,Lb)=>{u();var XO=kr(),uu=class extends XO{prefixed(e){return e==="-ms-"?":-ms-input-placeholder":`:${e}placeholder-shown`}};uu.names=[":placeholder-shown"];Lb.exports=uu});var Bb=x((x$,Nb)=>{u();var JO=kr(),ZO=_e(),fu=class extends JO{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=ZO.uniq(this.prefixes.map(n=>"-webkit-")))}prefixed(e){return e==="-webkit-"?"::-webkit-file-upload-button":`::${e}file-selector-button`}};fu.names=["::file-selector-button"];Nb.exports=fu});var Pe=x((k$,Fb)=>{u();Fb.exports=function(r){let e;return r==="-webkit- 2009"||r==="-moz-"?e=2009:r==="-ms-"?e=2012:r==="-webkit-"&&(e="final"),r==="-webkit- 2009"&&(r="-webkit-"),[e,r]}});var Vb=x((S$,Ub)=>{u();var jb=$e().list,zb=Pe(),eT=j(),Sr=class extends eT{prefixed(e,t){let i;return[i,t]=zb(t),i===2009?t+"box-flex":super.prefixed(e,t)}normalize(){return"flex"}set(e,t){let i=zb(t)[0];if(i===2009)return e.value=jb.space(e.value)[0],e.value=Sr.oldValues[e.value]||e.value,super.set(e,t);if(i===2012){let n=jb.space(e.value);n.length===3&&n[2]==="0"&&(e.value=n.slice(0,2).concat("0px").join(" "))}return super.set(e,t)}};Sr.names=["flex","box-flex"];Sr.oldValues={auto:"1",none:"0"};Ub.exports=Sr});var Gb=x((A$,Wb)=>{u();var Hb=Pe(),tT=j(),cu=class extends tT{prefixed(e,t){let i;return[i,t]=Hb(t),i===2009?t+"box-ordinal-group":i===2012?t+"flex-order":super.prefixed(e,t)}normalize(){return"order"}set(e,t){return Hb(t)[0]===2009&&/\d/.test(e.value)?(e.value=(parseInt(e.value)+1).toString(),super.set(e,t)):super.set(e,t)}};cu.names=["order","flex-order","box-ordinal-group"];Wb.exports=cu});var Yb=x((C$,Qb)=>{u();var rT=j(),pu=class extends rT{check(e){let t=e.value;return!t.toLowerCase().includes("alpha(")&&!t.includes("DXImageTransform.Microsoft")&&!t.includes("data:image/svg+xml")}};pu.names=["filter"];Qb.exports=pu});var Xb=x((_$,Kb)=>{u();var iT=j(),du=class extends iT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=this.clone(e),s=e.prop.replace(/end$/,"start"),o=t+e.prop.replace(/end$/,"span");if(!e.parent.some(l=>l.prop===o)){if(a.prop=o,e.value.includes("span"))a.value=e.value.replace(/span\s/i,"");else{let l;if(e.parent.walkDecls(s,c=>{l=c}),l){let c=Number(e.value)-Number(l.value)+"";a.value=c}else e.warn(n,`Can not prefix ${e.prop} (${s} is not found)`)}e.cloneBefore(a)}}};du.names=["grid-row-end","grid-column-end"];Kb.exports=du});var Zb=x((E$,Jb)=>{u();var nT=j(),hu=class extends nT{check(e){return!e.value.split(/\s+/).some(t=>{let i=t.toLowerCase();return i==="reverse"||i==="alternate-reverse"})}};hu.names=["animation","animation-direction"];Jb.exports=hu});var tw=x((O$,ew)=>{u();var sT=Pe(),aT=j(),mu=class extends aT{insert(e,t,i){let n;if([n,t]=sT(t),n!==2009)return super.insert(e,t,i);let a=e.value.split(/\s+/).filter(d=>d!=="wrap"&&d!=="nowrap"&&"wrap-reverse");if(a.length===0||e.parent.some(d=>d.prop===t+"box-orient"||d.prop===t+"box-direction"))return;let o=a[0],l=o.includes("row")?"horizontal":"vertical",c=o.includes("reverse")?"reverse":"normal",f=this.clone(e);return f.prop=t+"box-orient",f.value=l,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f),f=this.clone(e),f.prop=t+"box-direction",f.value=c,this.needCascade(e)&&(f.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,f)}};mu.names=["flex-flow","box-direction","box-orient"];ew.exports=mu});var iw=x((T$,rw)=>{u();var oT=Pe(),lT=j(),gu=class extends lT{normalize(){return"flex"}prefixed(e,t){let i;return[i,t]=oT(t),i===2009?t+"box-flex":i===2012?t+"flex-positive":super.prefixed(e,t)}};gu.names=["flex-grow","flex-positive"];rw.exports=gu});var sw=x((R$,nw)=>{u();var uT=Pe(),fT=j(),yu=class extends fT{set(e,t){if(uT(t)[0]!==2009)return super.set(e,t)}};yu.names=["flex-wrap"];nw.exports=yu});var ow=x((P$,aw)=>{u();var cT=j(),Ar=Bt(),bu=class extends cT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=Ar.parse(e),[s,o]=Ar.translate(a,0,2),[l,c]=Ar.translate(a,1,3);[["grid-row",s],["grid-row-span",o],["grid-column",l],["grid-column-span",c]].forEach(([f,d])=>{Ar.insertDecl(e,f,d)}),Ar.warnTemplateSelectorNotFound(e,n),Ar.warnIfGridRowColumnExists(e,n)}};bu.names=["grid-area"];aw.exports=bu});var uw=x((I$,lw)=>{u();var pT=j(),Mi=Bt(),wu=class extends pT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(s=>s.prop==="-ms-grid-row-align"))return;let[[n,a]]=Mi.parse(e);a?(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",a)):(Mi.insertDecl(e,"grid-row-align",n),Mi.insertDecl(e,"grid-column-align",n))}};wu.names=["place-self"];lw.exports=wu});var cw=x((D$,fw)=>{u();var dT=j(),vu=class extends dT{check(e){let t=e.value;return!t.includes("/")||t.includes("span")}normalize(e){return e.replace("-start","")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-ms-"&&(i=i.replace("-start","")),i}};vu.names=["grid-row-start","grid-column-start"];fw.exports=vu});var hw=x((q$,dw)=>{u();var pw=Pe(),hT=j(),Cr=class extends hT{check(e){return e.parent&&!e.parent.some(t=>t.prop&&t.prop.startsWith("grid-"))}prefixed(e,t){let i;return[i,t]=pw(t),i===2012?t+"flex-item-align":super.prefixed(e,t)}normalize(){return"align-self"}set(e,t){let i=pw(t)[0];if(i===2012)return e.value=Cr.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Cr.names=["align-self","flex-item-align"];Cr.oldValues={"flex-end":"end","flex-start":"start"};dw.exports=Cr});var gw=x(($$,mw)=>{u();var mT=j(),gT=_e(),xu=class extends mT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=gT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};xu.names=["appearance"];mw.exports=xu});var ww=x((L$,bw)=>{u();var yw=Pe(),yT=j(),ku=class extends yT{normalize(){return"flex-basis"}prefixed(e,t){let i;return[i,t]=yw(t),i===2012?t+"flex-preferred-size":super.prefixed(e,t)}set(e,t){let i;if([i,t]=yw(t),i===2012||i==="final")return super.set(e,t)}};ku.names=["flex-basis","flex-preferred-size"];bw.exports=ku});var xw=x((M$,vw)=>{u();var bT=j(),Su=class extends bT{normalize(){return this.name.replace("box-image","border")}prefixed(e,t){let i=super.prefixed(e,t);return t==="-webkit-"&&(i=i.replace("border","box-image")),i}};Su.names=["mask-border","mask-border-source","mask-border-slice","mask-border-width","mask-border-outset","mask-border-repeat","mask-box-image","mask-box-image-source","mask-box-image-slice","mask-box-image-width","mask-box-image-outset","mask-box-image-repeat"];vw.exports=Su});var Sw=x((N$,kw)=>{u();var wT=j(),lt=class extends wT{insert(e,t,i){let n=e.prop==="mask-composite",a;n?a=e.value.split(","):a=e.value.match(lt.regexp)||[],a=a.map(c=>c.trim()).filter(c=>c);let s=a.length,o;if(s&&(o=this.clone(e),o.value=a.map(c=>lt.oldValues[c]||c).join(", "),a.includes("intersect")&&(o.value+=", xor"),o.prop=t+"mask-composite"),n)return s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):void 0;let l=this.clone(e);return l.prop=t+l.prop,s&&(l.value=l.value.replace(lt.regexp,"")),this.needCascade(e)&&(l.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,l),s?(this.needCascade(e)&&(o.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,o)):e}};lt.names=["mask","mask-composite"];lt.oldValues={add:"source-over",subtract:"source-out",intersect:"source-in",exclude:"xor"};lt.regexp=new RegExp(`\\s+(${Object.keys(lt.oldValues).join("|")})\\b(?!\\))\\s*(?=[,])`,"ig");kw.exports=lt});var _w=x((B$,Cw)=>{u();var Aw=Pe(),vT=j(),_r=class extends vT{prefixed(e,t){let i;return[i,t]=Aw(t),i===2009?t+"box-align":i===2012?t+"flex-align":super.prefixed(e,t)}normalize(){return"align-items"}set(e,t){let i=Aw(t)[0];return(i===2009||i===2012)&&(e.value=_r.oldValues[e.value]||e.value),super.set(e,t)}};_r.names=["align-items","flex-align","box-align"];_r.oldValues={"flex-end":"end","flex-start":"start"};Cw.exports=_r});var Ow=x((F$,Ew)=>{u();var xT=j(),Au=class extends xT{set(e,t){return t==="-ms-"&&e.value==="contain"&&(e.value="element"),super.set(e,t)}insert(e,t,i){if(!(e.value==="all"&&t==="-ms-"))return super.insert(e,t,i)}};Au.names=["user-select"];Ew.exports=Au});var Pw=x((j$,Rw)=>{u();var Tw=Pe(),kT=j(),Cu=class extends kT{normalize(){return"flex-shrink"}prefixed(e,t){let i;return[i,t]=Tw(t),i===2012?t+"flex-negative":super.prefixed(e,t)}set(e,t){let i;if([i,t]=Tw(t),i===2012||i==="final")return super.set(e,t)}};Cu.names=["flex-shrink","flex-negative"];Rw.exports=Cu});var Dw=x((z$,Iw)=>{u();var ST=j(),_u=class extends ST{prefixed(e,t){return`${t}column-${e}`}normalize(e){return e.includes("inside")?"break-inside":e.includes("before")?"break-before":"break-after"}set(e,t){return(e.prop==="break-inside"&&e.value==="avoid-column"||e.value==="avoid-page")&&(e.value="avoid"),super.set(e,t)}insert(e,t,i){if(e.prop!=="break-inside")return super.insert(e,t,i);if(!(/region/i.test(e.value)||/page/i.test(e.value)))return super.insert(e,t,i)}};_u.names=["break-inside","page-break-inside","column-break-inside","break-before","page-break-before","column-break-before","break-after","page-break-after","column-break-after"];Iw.exports=_u});var $w=x((U$,qw)=>{u();var AT=j(),Eu=class extends AT{prefixed(e,t){return t+"print-color-adjust"}normalize(){return"color-adjust"}};Eu.names=["color-adjust","print-color-adjust"];qw.exports=Eu});var Mw=x((V$,Lw)=>{u();var CT=j(),Er=class extends CT{insert(e,t,i){if(t==="-ms-"){let n=this.set(this.clone(e),t);this.needCascade(e)&&(n.raws.before=this.calcBefore(i,e,t));let a="ltr";return e.parent.nodes.forEach(s=>{s.prop==="direction"&&(s.value==="rtl"||s.value==="ltr")&&(a=s.value)}),n.value=Er.msValues[a][e.value]||e.value,e.parent.insertBefore(e,n)}return super.insert(e,t,i)}};Er.names=["writing-mode"];Er.msValues={ltr:{"horizontal-tb":"lr-tb","vertical-rl":"tb-rl","vertical-lr":"tb-lr"},rtl:{"horizontal-tb":"rl-tb","vertical-rl":"bt-rl","vertical-lr":"bt-lr"}};Lw.exports=Er});var Bw=x((H$,Nw)=>{u();var _T=j(),Ou=class extends _T{set(e,t){return e.value=e.value.replace(/\s+fill(\s)/,"$1"),super.set(e,t)}};Ou.names=["border-image"];Nw.exports=Ou});var zw=x((W$,jw)=>{u();var Fw=Pe(),ET=j(),Or=class extends ET{prefixed(e,t){let i;return[i,t]=Fw(t),i===2012?t+"flex-line-pack":super.prefixed(e,t)}normalize(){return"align-content"}set(e,t){let i=Fw(t)[0];if(i===2012)return e.value=Or.oldValues[e.value]||e.value,super.set(e,t);if(i==="final")return super.set(e,t)}};Or.names=["align-content","flex-line-pack"];Or.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};jw.exports=Or});var Vw=x((G$,Uw)=>{u();var OT=j(),We=class extends OT{prefixed(e,t){return t==="-moz-"?t+(We.toMozilla[e]||e):super.prefixed(e,t)}normalize(e){return We.toNormal[e]||e}};We.names=["border-radius"];We.toMozilla={};We.toNormal={};for(let r of["top","bottom"])for(let e of["left","right"]){let t=`border-${r}-${e}-radius`,i=`border-radius-${r}${e}`;We.names.push(t),We.names.push(i),We.toMozilla[t]=i,We.toNormal[i]=t}Uw.exports=We});var Ww=x((Q$,Hw)=>{u();var TT=j(),Tu=class extends TT{prefixed(e,t){return e.includes("-start")?t+e.replace("-block-start","-before"):t+e.replace("-block-end","-after")}normalize(e){return e.includes("-before")?e.replace("-before","-block-start"):e.replace("-after","-block-end")}};Tu.names=["border-block-start","border-block-end","margin-block-start","margin-block-end","padding-block-start","padding-block-end","border-before","border-after","margin-before","margin-after","padding-before","padding-after"];Hw.exports=Tu});var Qw=x((Y$,Gw)=>{u();var RT=j(),{parseTemplate:PT,warnMissedAreas:IT,getGridGap:DT,warnGridGap:qT,inheritGridGap:$T}=Bt(),Ru=class extends RT{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);if(e.parent.some(h=>h.prop==="-ms-grid-rows"))return;let a=DT(e),s=$T(e,a),{rows:o,columns:l,areas:c}=PT({decl:e,gap:s||a}),f=Object.keys(c).length>0,d=Boolean(o),p=Boolean(l);return qT({gap:a,hasColumns:p,decl:e,result:n}),IT(c,e,n),(d&&p||f)&&e.cloneBefore({prop:"-ms-grid-rows",value:o,raws:{}}),p&&e.cloneBefore({prop:"-ms-grid-columns",value:l,raws:{}}),e}};Ru.names=["grid-template"];Gw.exports=Ru});var Kw=x((K$,Yw)=>{u();var LT=j(),Pu=class extends LT{prefixed(e,t){return t+e.replace("-inline","")}normalize(e){return e.replace(/(margin|padding|border)-(start|end)/,"$1-inline-$2")}};Pu.names=["border-inline-start","border-inline-end","margin-inline-start","margin-inline-end","padding-inline-start","padding-inline-end","border-start","border-end","margin-start","margin-end","padding-start","padding-end"];Yw.exports=Pu});var Jw=x((X$,Xw)=>{u();var MT=j(),Iu=class extends MT{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-row-align"}normalize(){return"align-self"}};Iu.names=["grid-row-align"];Xw.exports=Iu});var e0=x((J$,Zw)=>{u();var NT=j(),Tr=class extends NT{keyframeParents(e){let{parent:t}=e;for(;t;){if(t.type==="atrule"&&t.name==="keyframes")return!0;({parent:t}=t)}return!1}contain3d(e){if(e.prop==="transform-origin")return!1;for(let t of Tr.functions3d)if(e.value.includes(`${t}(`))return!0;return!1}set(e,t){return e=super.set(e,t),t==="-ms-"&&(e.value=e.value.replace(/rotatez/gi,"rotate")),e}insert(e,t,i){if(t==="-ms-"){if(!this.contain3d(e)&&!this.keyframeParents(e))return super.insert(e,t,i)}else if(t==="-o-"){if(!this.contain3d(e))return super.insert(e,t,i)}else return super.insert(e,t,i)}};Tr.names=["transform","transform-origin"];Tr.functions3d=["matrix3d","translate3d","translateZ","scale3d","scaleZ","rotate3d","rotateX","rotateY","perspective"];Zw.exports=Tr});var i0=x((Z$,r0)=>{u();var t0=Pe(),BT=j(),Du=class extends BT{normalize(){return"flex-direction"}insert(e,t,i){let n;if([n,t]=t0(t),n!==2009)return super.insert(e,t,i);if(e.parent.some(f=>f.prop===t+"box-orient"||f.prop===t+"box-direction"))return;let s=e.value,o,l;s==="inherit"||s==="initial"||s==="unset"?(o=s,l=s):(o=s.includes("row")?"horizontal":"vertical",l=s.includes("reverse")?"reverse":"normal");let c=this.clone(e);return c.prop=t+"box-orient",c.value=o,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c),c=this.clone(e),c.prop=t+"box-direction",c.value=l,this.needCascade(e)&&(c.raws.before=this.calcBefore(i,e,t)),e.parent.insertBefore(e,c)}old(e,t){let i;return[i,t]=t0(t),i===2009?[t+"box-orient",t+"box-direction"]:super.old(e,t)}};Du.names=["flex-direction","box-direction","box-orient"];r0.exports=Du});var s0=x((eL,n0)=>{u();var FT=j(),qu=class extends FT{check(e){return e.value==="pixelated"}prefixed(e,t){return t==="-ms-"?"-ms-interpolation-mode":super.prefixed(e,t)}set(e,t){return t!=="-ms-"?super.set(e,t):(e.prop="-ms-interpolation-mode",e.value="nearest-neighbor",e)}normalize(){return"image-rendering"}process(e,t){return super.process(e,t)}};qu.names=["image-rendering","interpolation-mode"];n0.exports=qu});var o0=x((tL,a0)=>{u();var jT=j(),zT=_e(),$u=class extends jT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=zT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}};$u.names=["backdrop-filter"];a0.exports=$u});var u0=x((rL,l0)=>{u();var UT=j(),VT=_e(),Lu=class extends UT{constructor(e,t,i){super(e,t,i);this.prefixes&&(this.prefixes=VT.uniq(this.prefixes.map(n=>n==="-ms-"?"-webkit-":n)))}check(e){return e.value.toLowerCase()==="text"}};Lu.names=["background-clip"];l0.exports=Lu});var c0=x((iL,f0)=>{u();var HT=j(),WT=["none","underline","overline","line-through","blink","inherit","initial","unset"],Mu=class extends HT{check(e){return e.value.split(/\s+/).some(t=>!WT.includes(t))}};Mu.names=["text-decoration"];f0.exports=Mu});var h0=x((nL,d0)=>{u();var p0=Pe(),GT=j(),Rr=class extends GT{prefixed(e,t){let i;return[i,t]=p0(t),i===2009?t+"box-pack":i===2012?t+"flex-pack":super.prefixed(e,t)}normalize(){return"justify-content"}set(e,t){let i=p0(t)[0];if(i===2009||i===2012){let n=Rr.oldValues[e.value]||e.value;if(e.value=n,i!==2009||n!=="distribute")return super.set(e,t)}else if(i==="final")return super.set(e,t)}};Rr.names=["justify-content","flex-pack","box-pack"];Rr.oldValues={"flex-end":"end","flex-start":"start","space-between":"justify","space-around":"distribute"};d0.exports=Rr});var g0=x((sL,m0)=>{u();var QT=j(),Nu=class extends QT{set(e,t){let i=e.value.toLowerCase();return t==="-webkit-"&&!i.includes(" ")&&i!=="contain"&&i!=="cover"&&(e.value=e.value+" "+e.value),super.set(e,t)}};Nu.names=["background-size"];m0.exports=Nu});var b0=x((aL,y0)=>{u();var YT=j(),Bu=Bt(),Fu=class extends YT{insert(e,t,i){if(t!=="-ms-")return super.insert(e,t,i);let n=Bu.parse(e),[a,s]=Bu.translate(n,0,1);n[0]&&n[0].includes("span")&&(s=n[0].join("").replace(/\D/g,"")),[[e.prop,a],[`${e.prop}-span`,s]].forEach(([l,c])=>{Bu.insertDecl(e,l,c)})}};Fu.names=["grid-row","grid-column"];y0.exports=Fu});var x0=x((oL,v0)=>{u();var KT=j(),{prefixTrackProp:w0,prefixTrackValue:XT,autoplaceGridItems:JT,getGridGap:ZT,inheritGridGap:eR}=Bt(),tR=su(),ju=class extends KT{prefixed(e,t){return t==="-ms-"?w0({prop:e,prefix:t}):super.prefixed(e,t)}normalize(e){return e.replace(/^grid-(rows|columns)/,"grid-template-$1")}insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let{parent:a,prop:s,value:o}=e,l=s.includes("rows"),c=s.includes("columns"),f=a.some(k=>k.prop==="grid-template"||k.prop==="grid-template-areas");if(f&&l)return!1;let d=new tR({options:{}}),p=d.gridStatus(a,n),h=ZT(e);h=eR(e,h)||h;let b=l?h.row:h.column;(p==="no-autoplace"||p===!0)&&!f&&(b=null);let v=XT({value:o,gap:b});e.cloneBefore({prop:w0({prop:s,prefix:t}),value:v});let y=a.nodes.find(k=>k.prop==="grid-auto-flow"),w="row";if(y&&!d.disabled(y,n)&&(w=y.value.trim()),p==="autoplace"){let k=a.nodes.find(E=>E.prop==="grid-template-rows");if(!k&&f)return;if(!k&&!f){e.warn(n,"Autoplacement does not work without grid-template-rows property");return}!a.nodes.find(E=>E.prop==="grid-template-columns")&&!f&&e.warn(n,"Autoplacement does not work without grid-template-columns property"),c&&!f&&JT(e,n,h,w)}}};ju.names=["grid-template-rows","grid-template-columns","grid-rows","grid-columns"];v0.exports=ju});var S0=x((lL,k0)=>{u();var rR=j(),zu=class extends rR{check(e){return!e.value.includes("flex-")&&e.value!=="baseline"}prefixed(e,t){return t+"grid-column-align"}normalize(){return"justify-self"}};zu.names=["grid-column-align"];k0.exports=zu});var C0=x((uL,A0)=>{u();var iR=j(),Uu=class extends iR{prefixed(e,t){return t+"scroll-chaining"}normalize(){return"overscroll-behavior"}set(e,t){return e.value==="auto"?e.value="chained":(e.value==="none"||e.value==="contain")&&(e.value="none"),super.set(e,t)}};Uu.names=["overscroll-behavior","scroll-chaining"];A0.exports=Uu});var O0=x((fL,E0)=>{u();var nR=j(),{parseGridAreas:sR,warnMissedAreas:aR,prefixTrackProp:oR,prefixTrackValue:_0,getGridGap:lR,warnGridGap:uR,inheritGridGap:fR}=Bt();function cR(r){return r.trim().slice(1,-1).split(/["']\s*["']?/g)}var Vu=class extends nR{insert(e,t,i,n){if(t!=="-ms-")return super.insert(e,t,i);let a=!1,s=!1,o=e.parent,l=lR(e);l=fR(e,l)||l,o.walkDecls(/-ms-grid-rows/,d=>d.remove()),o.walkDecls(/grid-template-(rows|columns)/,d=>{if(d.prop==="grid-template-rows"){s=!0;let{prop:p,value:h}=d;d.cloneBefore({prop:oR({prop:p,prefix:t}),value:_0({value:h,gap:l.row})})}else a=!0});let c=cR(e.value);a&&!s&&l.row&&c.length>1&&e.cloneBefore({prop:"-ms-grid-rows",value:_0({value:`repeat(${c.length}, auto)`,gap:l.row}),raws:{}}),uR({gap:l,hasColumns:a,decl:e,result:n});let f=sR({rows:c,gap:l});return aR(f,e,n),e}};Vu.names=["grid-template-areas"];E0.exports=Vu});var R0=x((cL,T0)=>{u();var pR=j(),Hu=class extends pR{set(e,t){return t==="-webkit-"&&(e.value=e.value.replace(/\s*(right|left)\s*/i,"")),super.set(e,t)}};Hu.names=["text-emphasis-position"];T0.exports=Hu});var I0=x((pL,P0)=>{u();var dR=j(),Wu=class extends dR{set(e,t){return e.prop==="text-decoration-skip-ink"&&e.value==="auto"?(e.prop=t+"text-decoration-skip",e.value="ink",e):super.set(e,t)}};Wu.names=["text-decoration-skip-ink","text-decoration-skip"];P0.exports=Wu});var N0=x((dL,M0)=>{u();"use strict";M0.exports={wrap:D0,limit:q0,validate:$0,test:Gu,curry:hR,name:L0};function D0(r,e,t){var i=e-r;return((t-r)%i+i)%i+r}function q0(r,e,t){return Math.max(r,Math.min(e,t))}function $0(r,e,t,i,n){if(!Gu(r,e,t,i,n))throw new Error(t+" is outside of range ["+r+","+e+")");return t}function Gu(r,e,t,i,n){return!(te||n&&t===e||i&&t===r)}function L0(r,e,t,i){return(t?"(":"[")+r+","+e+(i?")":"]")}function hR(r,e,t,i){var n=L0.bind(null,r,e,t,i);return{wrap:D0.bind(null,r,e),limit:q0.bind(null,r,e),validate:function(a){return $0(r,e,a,t,i)},test:function(a){return Gu(r,e,a,t,i)},toString:n,name:n}}});var j0=x((hL,F0)=>{u();var Qu=$s(),mR=N0(),gR=xr(),yR=He(),bR=_e(),B0=/top|left|right|bottom/gi,wt=class extends yR{replace(e,t){let i=Qu(e);for(let n of i.nodes)if(n.type==="function"&&n.value===this.name)if(n.nodes=this.newDirection(n.nodes),n.nodes=this.normalize(n.nodes),t==="-webkit- old"){if(!this.oldWebkit(n))return!1}else n.nodes=this.convertDirection(n.nodes),n.value=t+n.value;return i.toString()}replaceFirst(e,...t){return t.map(n=>n===" "?{type:"space",value:n}:{type:"word",value:n}).concat(e.slice(1))}normalizeUnit(e,t){return`${parseFloat(e)/t*360}deg`}normalize(e){if(!e[0])return e;if(/-?\d+(.\d+)?grad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,400);else if(/-?\d+(.\d+)?rad/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,2*Math.PI);else if(/-?\d+(.\d+)?turn/.test(e[0].value))e[0].value=this.normalizeUnit(e[0].value,1);else if(e[0].value.includes("deg")){let t=parseFloat(e[0].value);t=mR.wrap(0,360,t),e[0].value=`${t}deg`}return e[0].value==="0deg"?e=this.replaceFirst(e,"to"," ","top"):e[0].value==="90deg"?e=this.replaceFirst(e,"to"," ","right"):e[0].value==="180deg"?e=this.replaceFirst(e,"to"," ","bottom"):e[0].value==="270deg"&&(e=this.replaceFirst(e,"to"," ","left")),e}newDirection(e){if(e[0].value==="to"||(B0.lastIndex=0,!B0.test(e[0].value)))return e;e.unshift({type:"word",value:"to"},{type:"space",value:" "});for(let t=2;t0&&(e[0].value==="to"?this.fixDirection(e):e[0].value.includes("deg")?this.fixAngle(e):this.isRadial(e)&&this.fixRadial(e)),e}fixDirection(e){e.splice(0,2);for(let t of e){if(t.type==="div")break;t.type==="word"&&(t.value=this.revertDirection(t.value))}}fixAngle(e){let t=e[0].value;t=parseFloat(t),t=Math.abs(450-t)%360,t=this.roundFloat(t,3),e[0].value=`${t}deg`}fixRadial(e){let t=[],i=[],n,a,s,o,l;for(o=0;o{u();var wR=xr(),vR=He();function z0(r){return new RegExp(`(^|[\\s,(])(${r}($|[\\s),]))`,"gi")}var Yu=class extends vR{regexp(){return this.regexpCache||(this.regexpCache=z0(this.name)),this.regexpCache}isStretch(){return this.name==="stretch"||this.name==="fill"||this.name==="fill-available"}replace(e,t){return t==="-moz-"&&this.isStretch()?e.replace(this.regexp(),"$1-moz-available$3"):t==="-webkit-"&&this.isStretch()?e.replace(this.regexp(),"$1-webkit-fill-available$3"):super.replace(e,t)}old(e){let t=e+this.name;return this.isStretch()&&(e==="-moz-"?t="-moz-available":e==="-webkit-"&&(t="-webkit-fill-available")),new wR(this.name,t,t,z0(t))}add(e,t){if(!(e.prop.includes("grid")&&t!=="-webkit-"))return super.add(e,t)}};Yu.names=["max-content","min-content","fit-content","fill","fill-available","stretch"];U0.exports=Yu});var G0=x((gL,W0)=>{u();var H0=xr(),xR=He(),Ku=class extends xR{replace(e,t){return t==="-webkit-"?e.replace(this.regexp(),"$1-webkit-optimize-contrast"):t==="-moz-"?e.replace(this.regexp(),"$1-moz-crisp-edges"):super.replace(e,t)}old(e){return e==="-webkit-"?new H0(this.name,"-webkit-optimize-contrast"):e==="-moz-"?new H0(this.name,"-moz-crisp-edges"):super.old(e)}};Ku.names=["pixelated"];W0.exports=Ku});var Y0=x((yL,Q0)=>{u();var kR=He(),Xu=class extends kR{replace(e,t){let i=super.replace(e,t);return t==="-webkit-"&&(i=i.replace(/("[^"]+"|'[^']+')(\s+\d+\w)/gi,"url($1)$2")),i}};Xu.names=["image-set"];Q0.exports=Xu});var X0=x((bL,K0)=>{u();var SR=$e().list,AR=He(),Ju=class extends AR{replace(e,t){return SR.space(e).map(i=>{if(i.slice(0,+this.name.length+1)!==this.name+"(")return i;let n=i.lastIndexOf(")"),a=i.slice(n+1),s=i.slice(this.name.length+1,n);if(t==="-webkit-"){let o=s.match(/\d*.?\d+%?/);o?(s=s.slice(o[0].length).trim(),s+=`, ${o[0]}`):s+=", 0.5"}return t+this.name+"("+s+")"+a}).join(" ")}};Ju.names=["cross-fade"];K0.exports=Ju});var Z0=x((wL,J0)=>{u();var CR=Pe(),_R=xr(),ER=He(),Zu=class extends ER{constructor(e,t){super(e,t);e==="display-flex"&&(this.name="flex")}check(e){return e.prop==="display"&&e.value===this.name}prefixed(e){let t,i;return[t,e]=CR(e),t===2009?this.name==="flex"?i="box":i="inline-box":t===2012?this.name==="flex"?i="flexbox":i="inline-flexbox":t==="final"&&(i=this.name),e+i}replace(e,t){return this.prefixed(t)}old(e){let t=this.prefixed(e);if(!!t)return new _R(this.name,t)}};Zu.names=["display-flex","inline-flex"];J0.exports=Zu});var tv=x((vL,ev)=>{u();var OR=He(),ef=class extends OR{constructor(e,t){super(e,t);e==="display-grid"&&(this.name="grid")}check(e){return e.prop==="display"&&e.value===this.name}};ef.names=["display-grid","inline-grid"];ev.exports=ef});var iv=x((xL,rv)=>{u();var TR=He(),tf=class extends TR{constructor(e,t){super(e,t);e==="filter-function"&&(this.name="filter")}};tf.names=["filter","filter-function"];rv.exports=tf});var ov=x((kL,av)=>{u();var nv=Li(),z=j(),sv=Fy(),RR=nb(),PR=su(),IR=Sb(),rf=Mt(),Pr=kr(),DR=Pb(),ut=He(),Ir=_e(),qR=Db(),$R=$b(),LR=Mb(),MR=Bb(),NR=Vb(),BR=Gb(),FR=Yb(),jR=Xb(),zR=Zb(),UR=tw(),VR=iw(),HR=sw(),WR=ow(),GR=uw(),QR=cw(),YR=hw(),KR=gw(),XR=ww(),JR=xw(),ZR=Sw(),e5=_w(),t5=Ow(),r5=Pw(),i5=Dw(),n5=$w(),s5=Mw(),a5=Bw(),o5=zw(),l5=Vw(),u5=Ww(),f5=Qw(),c5=Kw(),p5=Jw(),d5=e0(),h5=i0(),m5=s0(),g5=o0(),y5=u0(),b5=c0(),w5=h0(),v5=g0(),x5=b0(),k5=x0(),S5=S0(),A5=C0(),C5=O0(),_5=R0(),E5=I0(),O5=j0(),T5=V0(),R5=G0(),P5=Y0(),I5=X0(),D5=Z0(),q5=tv(),$5=iv();Pr.hack(qR);Pr.hack($R);Pr.hack(LR);Pr.hack(MR);z.hack(NR);z.hack(BR);z.hack(FR);z.hack(jR);z.hack(zR);z.hack(UR);z.hack(VR);z.hack(HR);z.hack(WR);z.hack(GR);z.hack(QR);z.hack(YR);z.hack(KR);z.hack(XR);z.hack(JR);z.hack(ZR);z.hack(e5);z.hack(t5);z.hack(r5);z.hack(i5);z.hack(n5);z.hack(s5);z.hack(a5);z.hack(o5);z.hack(l5);z.hack(u5);z.hack(f5);z.hack(c5);z.hack(p5);z.hack(d5);z.hack(h5);z.hack(m5);z.hack(g5);z.hack(y5);z.hack(b5);z.hack(w5);z.hack(v5);z.hack(x5);z.hack(k5);z.hack(S5);z.hack(A5);z.hack(C5);z.hack(_5);z.hack(E5);ut.hack(O5);ut.hack(T5);ut.hack(R5);ut.hack(P5);ut.hack(I5);ut.hack(D5);ut.hack(q5);ut.hack($5);var nf=new Map,Ni=class{constructor(e,t,i={}){this.data=e,this.browsers=t,this.options=i,[this.add,this.remove]=this.preprocess(this.select(this.data)),this.transition=new RR(this),this.processor=new PR(this)}cleaner(){if(this.cleanerCache)return this.cleanerCache;if(this.browsers.selected.length){let e=new rf(this.browsers.data,[]);this.cleanerCache=new Ni(this.data,e,this.options)}else return this;return this.cleanerCache}select(e){let t={add:{},remove:{}};for(let i in e){let n=e[i],a=n.browsers.map(l=>{let c=l.split(" ");return{browser:`${c[0]} ${c[1]}`,note:c[2]}}),s=a.filter(l=>l.note).map(l=>`${this.browsers.prefix(l.browser)} ${l.note}`);s=Ir.uniq(s),a=a.filter(l=>this.browsers.isSelected(l.browser)).map(l=>{let c=this.browsers.prefix(l.browser);return l.note?`${c} ${l.note}`:c}),a=this.sort(Ir.uniq(a)),this.options.flexbox==="no-2009"&&(a=a.filter(l=>!l.includes("2009")));let o=n.browsers.map(l=>this.browsers.prefix(l));n.mistakes&&(o=o.concat(n.mistakes)),o=o.concat(s),o=Ir.uniq(o),a.length?(t.add[i]=a,a.length!a.includes(l)))):t.remove[i]=o}return t}sort(e){return e.sort((t,i)=>{let n=Ir.removeNote(t).length,a=Ir.removeNote(i).length;return n===a?i.length-t.length:a-n})}preprocess(e){let t={selectors:[],"@supports":new IR(Ni,this)};for(let n in e.add){let a=e.add[n];if(n==="@keyframes"||n==="@viewport")t[n]=new DR(n,a,this);else if(n==="@resolution")t[n]=new sv(n,a,this);else if(this.data[n].selector)t.selectors.push(Pr.load(n,a,this));else{let s=this.data[n].props;if(s){let o=ut.load(n,a,this);for(let l of s)t[l]||(t[l]={values:[]}),t[l].values.push(o)}else{let o=t[n]&&t[n].values||[];t[n]=z.load(n,a,this),t[n].values=o}}}let i={selectors:[]};for(let n in e.remove){let a=e.remove[n];if(this.data[n].selector){let s=Pr.load(n,a);for(let o of a)i.selectors.push(s.old(o))}else if(n==="@keyframes"||n==="@viewport")for(let s of a){let o=`@${s}${n.slice(1)}`;i[o]={remove:!0}}else if(n==="@resolution")i[n]=new sv(n,a,this);else{let s=this.data[n].props;if(s){let o=ut.load(n,[],this);for(let l of a){let c=o.old(l);if(c)for(let f of s)i[f]||(i[f]={}),i[f].values||(i[f].values=[]),i[f].values.push(c)}}else for(let o of a){let l=this.decl(n).old(n,o);if(n==="align-self"){let c=t[n]&&t[n].prefixes;if(c){if(o==="-webkit- 2009"&&c.includes("-webkit-"))continue;if(o==="-webkit-"&&c.includes("-webkit- 2009"))continue}}for(let c of l)i[c]||(i[c]={}),i[c].remove=!0}}}return[t,i]}decl(e){return nf.has(e)||nf.set(e,z.load(e)),nf.get(e)}unprefixed(e){let t=this.normalize(nv.unprefixed(e));return t==="flex-direction"&&(t="flex-flow"),t}normalize(e){return this.decl(e).normalize(e)}prefixed(e,t){return e=nv.unprefixed(e),this.decl(e).prefixed(e,t)}values(e,t){let i=this[e],n=i["*"]&&i["*"].values,a=i[t]&&i[t].values;return n&&a?Ir.uniq(n.concat(a)):n||a||[]}group(e){let t=e.parent,i=t.index(e),{length:n}=t.nodes,a=this.unprefixed(e.prop),s=(o,l)=>{for(i+=o;i>=0&&i{u();lv.exports={"backdrop-filter":{feature:"css-backdrop-filter",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},element:{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-element-function",browsers:["firefox 114"]},"user-select":{mistakes:["-khtml-"],feature:"user-select-none",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"background-clip":{feature:"background-clip-text",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},hyphens:{feature:"css-hyphens",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},fill:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"fill-available":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},stretch:{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"fit-content":{props:["width","min-width","max-width","height","min-height","max-height","inline-size","min-inline-size","max-inline-size","block-size","min-block-size","max-block-size","grid","grid-template","grid-template-rows","grid-template-columns","grid-auto-columns","grid-auto-rows"],feature:"intrinsic-width",browsers:["firefox 114"]},"text-decoration-style":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-color":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-line":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-decoration-skip-ink":{feature:"text-decoration",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"text-size-adjust":{feature:"text-size-adjust",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5"]},"mask-clip":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-composite":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-image":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-origin":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-repeat":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-source":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},mask:{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-position":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-size":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-outset":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-width":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"mask-border-slice":{feature:"css-masks",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},"clip-path":{feature:"css-clip-path",browsers:["samsung 21"]},"box-decoration-break":{feature:"css-boxdecorationbreak",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","opera 99","safari 16.5","samsung 21"]},appearance:{feature:"css-appearance",browsers:["samsung 21"]},"image-set":{props:["background","background-image","border-image","cursor","mask","mask-image","list-style","list-style-image","content"],feature:"css-image-set",browsers:["and_uc 15.5","chrome 109","samsung 21"]},"cross-fade":{props:["background","background-image","border-image","mask","list-style","list-style-image","content","mask-image"],feature:"css-cross-fade",browsers:["and_chr 114","and_uc 15.5","chrome 109","chrome 113","chrome 114","edge 114","opera 99","samsung 21"]},isolate:{props:["unicode-bidi"],feature:"css-unicode-bidi",browsers:["ios_saf 16.1","ios_saf 16.3","ios_saf 16.4","ios_saf 16.5","safari 16.5"]},"color-adjust":{feature:"css-color-adjust",browsers:["chrome 109","chrome 113","chrome 114","edge 114","opera 99"]}}});var cv=x((AL,fv)=>{u();fv.exports={}});var mv=x((CL,hv)=>{u();var L5=Gl(),{agents:M5}=(Ts(),Os),sf=_y(),N5=Mt(),B5=ov(),F5=uv(),j5=cv(),pv={browsers:M5,prefixes:F5},dv=` + Replace Autoprefixer \`browsers\` option to Browserslist config. + Use \`browserslist\` key in \`package.json\` or \`.browserslistrc\` file. + + Using \`browsers\` option can cause errors. Browserslist config can + be used for Babel, Autoprefixer, postcss-normalize and other tools. + + If you really need to use option, rename it to \`overrideBrowserslist\`. + + Learn more at: + https://github.com/browserslist/browserslist#readme + https://twitter.com/browserslist + +`;function z5(r){return Object.prototype.toString.apply(r)==="[object Object]"}var af=new Map;function U5(r,e){e.browsers.selected.length!==0&&(e.add.selectors.length>0||Object.keys(e.add).length>2||r.warn(`Autoprefixer target browsers do not need any prefixes.You do not need Autoprefixer anymore. +Check your Browserslist config to be sure that your targets are set up correctly. + + Learn more at: + https://github.com/postcss/autoprefixer#readme + https://github.com/browserslist/browserslist#readme + +`))}hv.exports=Dr;function Dr(...r){let e;if(r.length===1&&z5(r[0])?(e=r[0],r=void 0):r.length===0||r.length===1&&!r[0]?r=void 0:r.length<=2&&(Array.isArray(r[0])||!r[0])?(e=r[1],r=r[0]):typeof r[r.length-1]=="object"&&(e=r.pop()),e||(e={}),e.browser)throw new Error("Change `browser` option to `overrideBrowserslist` in Autoprefixer");if(e.browserslist)throw new Error("Change `browserslist` option to `overrideBrowserslist` in Autoprefixer");e.overrideBrowserslist?r=e.overrideBrowserslist:e.browsers&&(typeof console!="undefined"&&console.warn&&(sf.red?console.warn(sf.red(dv.replace(/`[^`]+`/g,n=>sf.yellow(n.slice(1,-1))))):console.warn(dv)),r=e.browsers);let t={ignoreUnknownVersions:e.ignoreUnknownVersions,stats:e.stats,env:e.env};function i(n){let a=pv,s=new N5(a.browsers,r,n,t),o=s.selected.join(", ")+JSON.stringify(e);return af.has(o)||af.set(o,new B5(a.prefixes,s,e)),af.get(o)}return{postcssPlugin:"autoprefixer",prepare(n){let a=i({from:n.opts.from,env:e.env});return{OnceExit(s){U5(n,a),e.remove!==!1&&a.processor.remove(s,n),e.add!==!1&&a.processor.add(s,n)}}},info(n){return n=n||{},n.from=n.from||m.cwd(),j5(i(n))},options:e,browsers:r}}Dr.postcss=!0;Dr.data=pv;Dr.defaults=L5.defaults;Dr.info=()=>Dr().info()});var gv={};Ge(gv,{default:()=>V5});var V5,yv=R(()=>{u();V5=[]});var wv={};Ge(wv,{default:()=>H5});var bv,H5,vv=R(()=>{u();Yi();bv=pe(en()),H5=St(bv.default.theme)});var kv={};Ge(kv,{default:()=>W5});var xv,W5,Sv=R(()=>{u();Yi();xv=pe(en()),W5=St(xv.default)});u();"use strict";var G5=vt(Ay()),Q5=vt($e()),Y5=vt(mv()),K5=vt((yv(),gv)),X5=vt((vv(),wv)),J5=vt((Sv(),kv)),Z5=vt((zs(),Af)),eP=vt((nl(),il)),tP=vt((ia(),ic));function vt(r){return r&&r.__esModule?r:{default:r}}console.warn("cdn.tailwindcss.com should not be used in production. To use Tailwind CSS in production, install it as a PostCSS plugin or use the Tailwind CLI: https://tailwindcss.com/docs/installation");var Ls="tailwind",of="text/tailwindcss",Av="/template.html",Yt,Cv=!0,_v=0,lf=new Set,uf,Ev="",Ov=(r=!1)=>({get(e,t){return(!r||t==="config")&&typeof e[t]=="object"&&e[t]!==null?new Proxy(e[t],Ov()):e[t]},set(e,t,i){return e[t]=i,(!r||t==="config")&&ff(!0),!0}});window[Ls]=new Proxy({config:{},defaultTheme:X5.default,defaultConfig:J5.default,colors:Z5.default,plugin:eP.default,resolveConfig:tP.default},Ov(!0));function Tv(r){uf.observe(r,{attributes:!0,attributeFilter:["type"],characterData:!0,subtree:!0,childList:!0})}new MutationObserver(async r=>{let e=!1;if(!uf){uf=new MutationObserver(async()=>await ff(!0));for(let t of document.querySelectorAll(`style[type="${of}"]`))Tv(t)}for(let t of r)for(let i of t.addedNodes)i.nodeType===1&&i.tagName==="STYLE"&&i.getAttribute("type")===of&&(Tv(i),e=!0);await ff(e)}).observe(document.documentElement,{attributes:!0,attributeFilter:["class"],childList:!0,subtree:!0});async function ff(r=!1){r&&(_v++,lf.clear());let e="";for(let i of document.querySelectorAll(`style[type="${of}"]`))e+=i.textContent;let t=new Set;for(let i of document.querySelectorAll("[class]"))for(let n of i.classList)lf.has(n)||t.add(n);if(document.body&&(Cv||t.size>0||e!==Ev||!Yt||!Yt.isConnected)){for(let n of t)lf.add(n);Cv=!1,Ev=e,self[Av]=Array.from(t).join(" ");let{css:i}=await(0,Q5.default)([(0,G5.default)({...window[Ls].config,_hash:_v,content:{files:[Av],extract:{html:n=>n.split(" ")}},plugins:[...K5.default,...Array.isArray(window[Ls].config.plugins)?window[Ls].config.plugins:[]]}),(0,Y5.default)({remove:!1})]).process(`@tailwind base;@tailwind components;@tailwind utilities;${e}`);(!Yt||!Yt.isConnected)&&(Yt=document.createElement("style"),document.head.append(Yt)),Yt.textContent=i}}})(); +/*! + * fill-range + * + * Copyright (c) 2014-present, Jon Schlinkert. + * Licensed under the MIT License. + */ +/*! + * is-number + * + * Copyright (c) 2014-present, Jon Schlinkert. + * Released under the MIT License. + */ +/*! + * to-regex-range + * + * Copyright (c) 2015-present, Jon Schlinkert. + * Released under the MIT License. + */ +/*! https://mths.be/cssesc v3.0.0 by @mathias */ + diff --git a/examples/server/public/deps_vue.esm-browser.js b/examples/server/public/deps_vue.esm-browser.js new file mode 100644 index 000000000..4679d9614 --- /dev/null +++ b/examples/server/public/deps_vue.esm-browser.js @@ -0,0 +1,18160 @@ +/** +* vue v3.5.12 +* (c) 2018-present Yuxi (Evan) You and Vue contributors +* @license MIT +**/ +/*! #__NO_SIDE_EFFECTS__ */ +// @__NO_SIDE_EFFECTS__ +function makeMap(str) { + const map = /* @__PURE__ */ Object.create(null); + for (const key of str.split(",")) map[key] = 1; + return (val) => val in map; +} + +const EMPTY_OBJ = Object.freeze({}) ; +const EMPTY_ARR = Object.freeze([]) ; +const NOOP = () => { +}; +const NO = () => false; +const isOn = (key) => key.charCodeAt(0) === 111 && key.charCodeAt(1) === 110 && // uppercase letter +(key.charCodeAt(2) > 122 || key.charCodeAt(2) < 97); +const isModelListener = (key) => key.startsWith("onUpdate:"); +const extend = Object.assign; +const remove = (arr, el) => { + const i = arr.indexOf(el); + if (i > -1) { + arr.splice(i, 1); + } +}; +const hasOwnProperty$1 = Object.prototype.hasOwnProperty; +const hasOwn = (val, key) => hasOwnProperty$1.call(val, key); +const isArray = Array.isArray; +const isMap = (val) => toTypeString(val) === "[object Map]"; +const isSet = (val) => toTypeString(val) === "[object Set]"; +const isDate = (val) => toTypeString(val) === "[object Date]"; +const isRegExp = (val) => toTypeString(val) === "[object RegExp]"; +const isFunction = (val) => typeof val === "function"; +const isString = (val) => typeof val === "string"; +const isSymbol = (val) => typeof val === "symbol"; +const isObject = (val) => val !== null && typeof val === "object"; +const isPromise = (val) => { + return (isObject(val) || isFunction(val)) && isFunction(val.then) && isFunction(val.catch); +}; +const objectToString = Object.prototype.toString; +const toTypeString = (value) => objectToString.call(value); +const toRawType = (value) => { + return toTypeString(value).slice(8, -1); +}; +const isPlainObject = (val) => toTypeString(val) === "[object Object]"; +const isIntegerKey = (key) => isString(key) && key !== "NaN" && key[0] !== "-" && "" + parseInt(key, 10) === key; +const isReservedProp = /* @__PURE__ */ makeMap( + // the leading comma is intentional so empty string "" is also included + ",key,ref,ref_for,ref_key,onVnodeBeforeMount,onVnodeMounted,onVnodeBeforeUpdate,onVnodeUpdated,onVnodeBeforeUnmount,onVnodeUnmounted" +); +const isBuiltInDirective = /* @__PURE__ */ makeMap( + "bind,cloak,else-if,else,for,html,if,model,on,once,pre,show,slot,text,memo" +); +const cacheStringFunction = (fn) => { + const cache = /* @__PURE__ */ Object.create(null); + return (str) => { + const hit = cache[str]; + return hit || (cache[str] = fn(str)); + }; +}; +const camelizeRE = /-(\w)/g; +const camelize = cacheStringFunction( + (str) => { + return str.replace(camelizeRE, (_, c) => c ? c.toUpperCase() : ""); + } +); +const hyphenateRE = /\B([A-Z])/g; +const hyphenate = cacheStringFunction( + (str) => str.replace(hyphenateRE, "-$1").toLowerCase() +); +const capitalize = cacheStringFunction((str) => { + return str.charAt(0).toUpperCase() + str.slice(1); +}); +const toHandlerKey = cacheStringFunction( + (str) => { + const s = str ? `on${capitalize(str)}` : ``; + return s; + } +); +const hasChanged = (value, oldValue) => !Object.is(value, oldValue); +const invokeArrayFns = (fns, ...arg) => { + for (let i = 0; i < fns.length; i++) { + fns[i](...arg); + } +}; +const def = (obj, key, value, writable = false) => { + Object.defineProperty(obj, key, { + configurable: true, + enumerable: false, + writable, + value + }); +}; +const looseToNumber = (val) => { + const n = parseFloat(val); + return isNaN(n) ? val : n; +}; +const toNumber = (val) => { + const n = isString(val) ? Number(val) : NaN; + return isNaN(n) ? val : n; +}; +let _globalThis; +const getGlobalThis = () => { + return _globalThis || (_globalThis = typeof globalThis !== "undefined" ? globalThis : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : {}); +}; +function genCacheKey(source, options) { + return source + JSON.stringify( + options, + (_, val) => typeof val === "function" ? val.toString() : val + ); +} + +const PatchFlagNames = { + [1]: `TEXT`, + [2]: `CLASS`, + [4]: `STYLE`, + [8]: `PROPS`, + [16]: `FULL_PROPS`, + [32]: `NEED_HYDRATION`, + [64]: `STABLE_FRAGMENT`, + [128]: `KEYED_FRAGMENT`, + [256]: `UNKEYED_FRAGMENT`, + [512]: `NEED_PATCH`, + [1024]: `DYNAMIC_SLOTS`, + [2048]: `DEV_ROOT_FRAGMENT`, + [-1]: `HOISTED`, + [-2]: `BAIL` +}; + +const slotFlagsText = { + [1]: "STABLE", + [2]: "DYNAMIC", + [3]: "FORWARDED" +}; + +const GLOBALS_ALLOWED = "Infinity,undefined,NaN,isFinite,isNaN,parseFloat,parseInt,decodeURI,decodeURIComponent,encodeURI,encodeURIComponent,Math,Number,Date,Array,Object,Boolean,String,RegExp,Map,Set,JSON,Intl,BigInt,console,Error,Symbol"; +const isGloballyAllowed = /* @__PURE__ */ makeMap(GLOBALS_ALLOWED); + +const range = 2; +function generateCodeFrame(source, start = 0, end = source.length) { + start = Math.max(0, Math.min(start, source.length)); + end = Math.max(0, Math.min(end, source.length)); + if (start > end) return ""; + let lines = source.split(/(\r?\n)/); + const newlineSequences = lines.filter((_, idx) => idx % 2 === 1); + lines = lines.filter((_, idx) => idx % 2 === 0); + let count = 0; + const res = []; + for (let i = 0; i < lines.length; i++) { + count += lines[i].length + (newlineSequences[i] && newlineSequences[i].length || 0); + if (count >= start) { + for (let j = i - range; j <= i + range || end > count; j++) { + if (j < 0 || j >= lines.length) continue; + const line = j + 1; + res.push( + `${line}${" ".repeat(Math.max(3 - String(line).length, 0))}| ${lines[j]}` + ); + const lineLength = lines[j].length; + const newLineSeqLength = newlineSequences[j] && newlineSequences[j].length || 0; + if (j === i) { + const pad = start - (count - (lineLength + newLineSeqLength)); + const length = Math.max( + 1, + end > count ? lineLength - pad : end - start + ); + res.push(` | ` + " ".repeat(pad) + "^".repeat(length)); + } else if (j > i) { + if (end > count) { + const length = Math.max(Math.min(end - count, lineLength), 1); + res.push(` | ` + "^".repeat(length)); + } + count += lineLength + newLineSeqLength; + } + } + break; + } + } + return res.join("\n"); +} + +function normalizeStyle(value) { + if (isArray(value)) { + const res = {}; + for (let i = 0; i < value.length; i++) { + const item = value[i]; + const normalized = isString(item) ? parseStringStyle(item) : normalizeStyle(item); + if (normalized) { + for (const key in normalized) { + res[key] = normalized[key]; + } + } + } + return res; + } else if (isString(value) || isObject(value)) { + return value; + } +} +const listDelimiterRE = /;(?![^(]*\))/g; +const propertyDelimiterRE = /:([^]+)/; +const styleCommentRE = /\/\*[^]*?\*\//g; +function parseStringStyle(cssText) { + const ret = {}; + cssText.replace(styleCommentRE, "").split(listDelimiterRE).forEach((item) => { + if (item) { + const tmp = item.split(propertyDelimiterRE); + tmp.length > 1 && (ret[tmp[0].trim()] = tmp[1].trim()); + } + }); + return ret; +} +function stringifyStyle(styles) { + let ret = ""; + if (!styles || isString(styles)) { + return ret; + } + for (const key in styles) { + const value = styles[key]; + if (isString(value) || typeof value === "number") { + const normalizedKey = key.startsWith(`--`) ? key : hyphenate(key); + ret += `${normalizedKey}:${value};`; + } + } + return ret; +} +function normalizeClass(value) { + let res = ""; + if (isString(value)) { + res = value; + } else if (isArray(value)) { + for (let i = 0; i < value.length; i++) { + const normalized = normalizeClass(value[i]); + if (normalized) { + res += normalized + " "; + } + } + } else if (isObject(value)) { + for (const name in value) { + if (value[name]) { + res += name + " "; + } + } + } + return res.trim(); +} +function normalizeProps(props) { + if (!props) return null; + let { class: klass, style } = props; + if (klass && !isString(klass)) { + props.class = normalizeClass(klass); + } + if (style) { + props.style = normalizeStyle(style); + } + return props; +} + +const HTML_TAGS = "html,body,base,head,link,meta,style,title,address,article,aside,footer,header,hgroup,h1,h2,h3,h4,h5,h6,nav,section,div,dd,dl,dt,figcaption,figure,picture,hr,img,li,main,ol,p,pre,ul,a,b,abbr,bdi,bdo,br,cite,code,data,dfn,em,i,kbd,mark,q,rp,rt,ruby,s,samp,small,span,strong,sub,sup,time,u,var,wbr,area,audio,map,track,video,embed,object,param,source,canvas,script,noscript,del,ins,caption,col,colgroup,table,thead,tbody,td,th,tr,button,datalist,fieldset,form,input,label,legend,meter,optgroup,option,output,progress,select,textarea,details,dialog,menu,summary,template,blockquote,iframe,tfoot"; +const SVG_TAGS = "svg,animate,animateMotion,animateTransform,circle,clipPath,color-profile,defs,desc,discard,ellipse,feBlend,feColorMatrix,feComponentTransfer,feComposite,feConvolveMatrix,feDiffuseLighting,feDisplacementMap,feDistantLight,feDropShadow,feFlood,feFuncA,feFuncB,feFuncG,feFuncR,feGaussianBlur,feImage,feMerge,feMergeNode,feMorphology,feOffset,fePointLight,feSpecularLighting,feSpotLight,feTile,feTurbulence,filter,foreignObject,g,hatch,hatchpath,image,line,linearGradient,marker,mask,mesh,meshgradient,meshpatch,meshrow,metadata,mpath,path,pattern,polygon,polyline,radialGradient,rect,set,solidcolor,stop,switch,symbol,text,textPath,title,tspan,unknown,use,view"; +const MATH_TAGS = "annotation,annotation-xml,maction,maligngroup,malignmark,math,menclose,merror,mfenced,mfrac,mfraction,mglyph,mi,mlabeledtr,mlongdiv,mmultiscripts,mn,mo,mover,mpadded,mphantom,mprescripts,mroot,mrow,ms,mscarries,mscarry,msgroup,msline,mspace,msqrt,msrow,mstack,mstyle,msub,msubsup,msup,mtable,mtd,mtext,mtr,munder,munderover,none,semantics"; +const VOID_TAGS = "area,base,br,col,embed,hr,img,input,link,meta,param,source,track,wbr"; +const isHTMLTag = /* @__PURE__ */ makeMap(HTML_TAGS); +const isSVGTag = /* @__PURE__ */ makeMap(SVG_TAGS); +const isMathMLTag = /* @__PURE__ */ makeMap(MATH_TAGS); +const isVoidTag = /* @__PURE__ */ makeMap(VOID_TAGS); + +const specialBooleanAttrs = `itemscope,allowfullscreen,formnovalidate,ismap,nomodule,novalidate,readonly`; +const isSpecialBooleanAttr = /* @__PURE__ */ makeMap(specialBooleanAttrs); +const isBooleanAttr = /* @__PURE__ */ makeMap( + specialBooleanAttrs + `,async,autofocus,autoplay,controls,default,defer,disabled,hidden,inert,loop,open,required,reversed,scoped,seamless,checked,muted,multiple,selected` +); +function includeBooleanAttr(value) { + return !!value || value === ""; +} +const isKnownHtmlAttr = /* @__PURE__ */ makeMap( + `accept,accept-charset,accesskey,action,align,allow,alt,async,autocapitalize,autocomplete,autofocus,autoplay,background,bgcolor,border,buffered,capture,challenge,charset,checked,cite,class,code,codebase,color,cols,colspan,content,contenteditable,contextmenu,controls,coords,crossorigin,csp,data,datetime,decoding,default,defer,dir,dirname,disabled,download,draggable,dropzone,enctype,enterkeyhint,for,form,formaction,formenctype,formmethod,formnovalidate,formtarget,headers,height,hidden,high,href,hreflang,http-equiv,icon,id,importance,inert,integrity,ismap,itemprop,keytype,kind,label,lang,language,loading,list,loop,low,manifest,max,maxlength,minlength,media,min,multiple,muted,name,novalidate,open,optimum,pattern,ping,placeholder,poster,preload,radiogroup,readonly,referrerpolicy,rel,required,reversed,rows,rowspan,sandbox,scope,scoped,selected,shape,size,sizes,slot,span,spellcheck,src,srcdoc,srclang,srcset,start,step,style,summary,tabindex,target,title,translate,type,usemap,value,width,wrap` +); +const isKnownSvgAttr = /* @__PURE__ */ makeMap( + `xmlns,accent-height,accumulate,additive,alignment-baseline,alphabetic,amplitude,arabic-form,ascent,attributeName,attributeType,azimuth,baseFrequency,baseline-shift,baseProfile,bbox,begin,bias,by,calcMode,cap-height,class,clip,clipPathUnits,clip-path,clip-rule,color,color-interpolation,color-interpolation-filters,color-profile,color-rendering,contentScriptType,contentStyleType,crossorigin,cursor,cx,cy,d,decelerate,descent,diffuseConstant,direction,display,divisor,dominant-baseline,dur,dx,dy,edgeMode,elevation,enable-background,end,exponent,fill,fill-opacity,fill-rule,filter,filterRes,filterUnits,flood-color,flood-opacity,font-family,font-size,font-size-adjust,font-stretch,font-style,font-variant,font-weight,format,from,fr,fx,fy,g1,g2,glyph-name,glyph-orientation-horizontal,glyph-orientation-vertical,glyphRef,gradientTransform,gradientUnits,hanging,height,href,hreflang,horiz-adv-x,horiz-origin-x,id,ideographic,image-rendering,in,in2,intercept,k,k1,k2,k3,k4,kernelMatrix,kernelUnitLength,kerning,keyPoints,keySplines,keyTimes,lang,lengthAdjust,letter-spacing,lighting-color,limitingConeAngle,local,marker-end,marker-mid,marker-start,markerHeight,markerUnits,markerWidth,mask,maskContentUnits,maskUnits,mathematical,max,media,method,min,mode,name,numOctaves,offset,opacity,operator,order,orient,orientation,origin,overflow,overline-position,overline-thickness,panose-1,paint-order,path,pathLength,patternContentUnits,patternTransform,patternUnits,ping,pointer-events,points,pointsAtX,pointsAtY,pointsAtZ,preserveAlpha,preserveAspectRatio,primitiveUnits,r,radius,referrerPolicy,refX,refY,rel,rendering-intent,repeatCount,repeatDur,requiredExtensions,requiredFeatures,restart,result,rotate,rx,ry,scale,seed,shape-rendering,slope,spacing,specularConstant,specularExponent,speed,spreadMethod,startOffset,stdDeviation,stemh,stemv,stitchTiles,stop-color,stop-opacity,strikethrough-position,strikethrough-thickness,string,stroke,stroke-dasharray,stroke-dashoffset,stroke-linecap,stroke-linejoin,stroke-miterlimit,stroke-opacity,stroke-width,style,surfaceScale,systemLanguage,tabindex,tableValues,target,targetX,targetY,text-anchor,text-decoration,text-rendering,textLength,to,transform,transform-origin,type,u1,u2,underline-position,underline-thickness,unicode,unicode-bidi,unicode-range,units-per-em,v-alphabetic,v-hanging,v-ideographic,v-mathematical,values,vector-effect,version,vert-adv-y,vert-origin-x,vert-origin-y,viewBox,viewTarget,visibility,width,widths,word-spacing,writing-mode,x,x-height,x1,x2,xChannelSelector,xlink:actuate,xlink:arcrole,xlink:href,xlink:role,xlink:show,xlink:title,xlink:type,xmlns:xlink,xml:base,xml:lang,xml:space,y,y1,y2,yChannelSelector,z,zoomAndPan` +); +function isRenderableAttrValue(value) { + if (value == null) { + return false; + } + const type = typeof value; + return type === "string" || type === "number" || type === "boolean"; +} + +const cssVarNameEscapeSymbolsRE = /[ !"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g; +function getEscapedCssVarName(key, doubleEscape) { + return key.replace( + cssVarNameEscapeSymbolsRE, + (s) => `\\${s}` + ); +} + +function looseCompareArrays(a, b) { + if (a.length !== b.length) return false; + let equal = true; + for (let i = 0; equal && i < a.length; i++) { + equal = looseEqual(a[i], b[i]); + } + return equal; +} +function looseEqual(a, b) { + if (a === b) return true; + let aValidType = isDate(a); + let bValidType = isDate(b); + if (aValidType || bValidType) { + return aValidType && bValidType ? a.getTime() === b.getTime() : false; + } + aValidType = isSymbol(a); + bValidType = isSymbol(b); + if (aValidType || bValidType) { + return a === b; + } + aValidType = isArray(a); + bValidType = isArray(b); + if (aValidType || bValidType) { + return aValidType && bValidType ? looseCompareArrays(a, b) : false; + } + aValidType = isObject(a); + bValidType = isObject(b); + if (aValidType || bValidType) { + if (!aValidType || !bValidType) { + return false; + } + const aKeysCount = Object.keys(a).length; + const bKeysCount = Object.keys(b).length; + if (aKeysCount !== bKeysCount) { + return false; + } + for (const key in a) { + const aHasKey = a.hasOwnProperty(key); + const bHasKey = b.hasOwnProperty(key); + if (aHasKey && !bHasKey || !aHasKey && bHasKey || !looseEqual(a[key], b[key])) { + return false; + } + } + } + return String(a) === String(b); +} +function looseIndexOf(arr, val) { + return arr.findIndex((item) => looseEqual(item, val)); +} + +const isRef$1 = (val) => { + return !!(val && val["__v_isRef"] === true); +}; +const toDisplayString = (val) => { + return isString(val) ? val : val == null ? "" : isArray(val) || isObject(val) && (val.toString === objectToString || !isFunction(val.toString)) ? isRef$1(val) ? toDisplayString(val.value) : JSON.stringify(val, replacer, 2) : String(val); +}; +const replacer = (_key, val) => { + if (isRef$1(val)) { + return replacer(_key, val.value); + } else if (isMap(val)) { + return { + [`Map(${val.size})`]: [...val.entries()].reduce( + (entries, [key, val2], i) => { + entries[stringifySymbol(key, i) + " =>"] = val2; + return entries; + }, + {} + ) + }; + } else if (isSet(val)) { + return { + [`Set(${val.size})`]: [...val.values()].map((v) => stringifySymbol(v)) + }; + } else if (isSymbol(val)) { + return stringifySymbol(val); + } else if (isObject(val) && !isArray(val) && !isPlainObject(val)) { + return String(val); + } + return val; +}; +const stringifySymbol = (v, i = "") => { + var _a; + return ( + // Symbol.description in es2019+ so we need to cast here to pass + // the lib: es2016 check + isSymbol(v) ? `Symbol(${(_a = v.description) != null ? _a : i})` : v + ); +}; + +function warn$2(msg, ...args) { + console.warn(`[Vue warn] ${msg}`, ...args); +} + +let activeEffectScope; +class EffectScope { + constructor(detached = false) { + this.detached = detached; + /** + * @internal + */ + this._active = true; + /** + * @internal + */ + this.effects = []; + /** + * @internal + */ + this.cleanups = []; + this._isPaused = false; + this.parent = activeEffectScope; + if (!detached && activeEffectScope) { + this.index = (activeEffectScope.scopes || (activeEffectScope.scopes = [])).push( + this + ) - 1; + } + } + get active() { + return this._active; + } + pause() { + if (this._active) { + this._isPaused = true; + let i, l; + if (this.scopes) { + for (i = 0, l = this.scopes.length; i < l; i++) { + this.scopes[i].pause(); + } + } + for (i = 0, l = this.effects.length; i < l; i++) { + this.effects[i].pause(); + } + } + } + /** + * Resumes the effect scope, including all child scopes and effects. + */ + resume() { + if (this._active) { + if (this._isPaused) { + this._isPaused = false; + let i, l; + if (this.scopes) { + for (i = 0, l = this.scopes.length; i < l; i++) { + this.scopes[i].resume(); + } + } + for (i = 0, l = this.effects.length; i < l; i++) { + this.effects[i].resume(); + } + } + } + } + run(fn) { + if (this._active) { + const currentEffectScope = activeEffectScope; + try { + activeEffectScope = this; + return fn(); + } finally { + activeEffectScope = currentEffectScope; + } + } else { + warn$2(`cannot run an inactive effect scope.`); + } + } + /** + * This should only be called on non-detached scopes + * @internal + */ + on() { + activeEffectScope = this; + } + /** + * This should only be called on non-detached scopes + * @internal + */ + off() { + activeEffectScope = this.parent; + } + stop(fromParent) { + if (this._active) { + let i, l; + for (i = 0, l = this.effects.length; i < l; i++) { + this.effects[i].stop(); + } + for (i = 0, l = this.cleanups.length; i < l; i++) { + this.cleanups[i](); + } + if (this.scopes) { + for (i = 0, l = this.scopes.length; i < l; i++) { + this.scopes[i].stop(true); + } + } + if (!this.detached && this.parent && !fromParent) { + const last = this.parent.scopes.pop(); + if (last && last !== this) { + this.parent.scopes[this.index] = last; + last.index = this.index; + } + } + this.parent = void 0; + this._active = false; + } + } +} +function effectScope(detached) { + return new EffectScope(detached); +} +function getCurrentScope() { + return activeEffectScope; +} +function onScopeDispose(fn, failSilently = false) { + if (activeEffectScope) { + activeEffectScope.cleanups.push(fn); + } else if (!failSilently) { + warn$2( + `onScopeDispose() is called when there is no active effect scope to be associated with.` + ); + } +} + +let activeSub; +const pausedQueueEffects = /* @__PURE__ */ new WeakSet(); +class ReactiveEffect { + constructor(fn) { + this.fn = fn; + /** + * @internal + */ + this.deps = void 0; + /** + * @internal + */ + this.depsTail = void 0; + /** + * @internal + */ + this.flags = 1 | 4; + /** + * @internal + */ + this.next = void 0; + /** + * @internal + */ + this.cleanup = void 0; + this.scheduler = void 0; + if (activeEffectScope && activeEffectScope.active) { + activeEffectScope.effects.push(this); + } + } + pause() { + this.flags |= 64; + } + resume() { + if (this.flags & 64) { + this.flags &= ~64; + if (pausedQueueEffects.has(this)) { + pausedQueueEffects.delete(this); + this.trigger(); + } + } + } + /** + * @internal + */ + notify() { + if (this.flags & 2 && !(this.flags & 32)) { + return; + } + if (!(this.flags & 8)) { + batch(this); + } + } + run() { + if (!(this.flags & 1)) { + return this.fn(); + } + this.flags |= 2; + cleanupEffect(this); + prepareDeps(this); + const prevEffect = activeSub; + const prevShouldTrack = shouldTrack; + activeSub = this; + shouldTrack = true; + try { + return this.fn(); + } finally { + if (activeSub !== this) { + warn$2( + "Active effect was not restored correctly - this is likely a Vue internal bug." + ); + } + cleanupDeps(this); + activeSub = prevEffect; + shouldTrack = prevShouldTrack; + this.flags &= ~2; + } + } + stop() { + if (this.flags & 1) { + for (let link = this.deps; link; link = link.nextDep) { + removeSub(link); + } + this.deps = this.depsTail = void 0; + cleanupEffect(this); + this.onStop && this.onStop(); + this.flags &= ~1; + } + } + trigger() { + if (this.flags & 64) { + pausedQueueEffects.add(this); + } else if (this.scheduler) { + this.scheduler(); + } else { + this.runIfDirty(); + } + } + /** + * @internal + */ + runIfDirty() { + if (isDirty(this)) { + this.run(); + } + } + get dirty() { + return isDirty(this); + } +} +let batchDepth = 0; +let batchedSub; +let batchedComputed; +function batch(sub, isComputed = false) { + sub.flags |= 8; + if (isComputed) { + sub.next = batchedComputed; + batchedComputed = sub; + return; + } + sub.next = batchedSub; + batchedSub = sub; +} +function startBatch() { + batchDepth++; +} +function endBatch() { + if (--batchDepth > 0) { + return; + } + if (batchedComputed) { + let e = batchedComputed; + batchedComputed = void 0; + while (e) { + const next = e.next; + e.next = void 0; + e.flags &= ~8; + e = next; + } + } + let error; + while (batchedSub) { + let e = batchedSub; + batchedSub = void 0; + while (e) { + const next = e.next; + e.next = void 0; + e.flags &= ~8; + if (e.flags & 1) { + try { + ; + e.trigger(); + } catch (err) { + if (!error) error = err; + } + } + e = next; + } + } + if (error) throw error; +} +function prepareDeps(sub) { + for (let link = sub.deps; link; link = link.nextDep) { + link.version = -1; + link.prevActiveLink = link.dep.activeLink; + link.dep.activeLink = link; + } +} +function cleanupDeps(sub) { + let head; + let tail = sub.depsTail; + let link = tail; + while (link) { + const prev = link.prevDep; + if (link.version === -1) { + if (link === tail) tail = prev; + removeSub(link); + removeDep(link); + } else { + head = link; + } + link.dep.activeLink = link.prevActiveLink; + link.prevActiveLink = void 0; + link = prev; + } + sub.deps = head; + sub.depsTail = tail; +} +function isDirty(sub) { + for (let link = sub.deps; link; link = link.nextDep) { + if (link.dep.version !== link.version || link.dep.computed && (refreshComputed(link.dep.computed) || link.dep.version !== link.version)) { + return true; + } + } + if (sub._dirty) { + return true; + } + return false; +} +function refreshComputed(computed) { + if (computed.flags & 4 && !(computed.flags & 16)) { + return; + } + computed.flags &= ~16; + if (computed.globalVersion === globalVersion) { + return; + } + computed.globalVersion = globalVersion; + const dep = computed.dep; + computed.flags |= 2; + if (dep.version > 0 && !computed.isSSR && computed.deps && !isDirty(computed)) { + computed.flags &= ~2; + return; + } + const prevSub = activeSub; + const prevShouldTrack = shouldTrack; + activeSub = computed; + shouldTrack = true; + try { + prepareDeps(computed); + const value = computed.fn(computed._value); + if (dep.version === 0 || hasChanged(value, computed._value)) { + computed._value = value; + dep.version++; + } + } catch (err) { + dep.version++; + throw err; + } finally { + activeSub = prevSub; + shouldTrack = prevShouldTrack; + cleanupDeps(computed); + computed.flags &= ~2; + } +} +function removeSub(link, soft = false) { + const { dep, prevSub, nextSub } = link; + if (prevSub) { + prevSub.nextSub = nextSub; + link.prevSub = void 0; + } + if (nextSub) { + nextSub.prevSub = prevSub; + link.nextSub = void 0; + } + if (dep.subsHead === link) { + dep.subsHead = nextSub; + } + if (dep.subs === link) { + dep.subs = prevSub; + if (!prevSub && dep.computed) { + dep.computed.flags &= ~4; + for (let l = dep.computed.deps; l; l = l.nextDep) { + removeSub(l, true); + } + } + } + if (!soft && !--dep.sc && dep.map) { + dep.map.delete(dep.key); + } +} +function removeDep(link) { + const { prevDep, nextDep } = link; + if (prevDep) { + prevDep.nextDep = nextDep; + link.prevDep = void 0; + } + if (nextDep) { + nextDep.prevDep = prevDep; + link.nextDep = void 0; + } +} +function effect(fn, options) { + if (fn.effect instanceof ReactiveEffect) { + fn = fn.effect.fn; + } + const e = new ReactiveEffect(fn); + if (options) { + extend(e, options); + } + try { + e.run(); + } catch (err) { + e.stop(); + throw err; + } + const runner = e.run.bind(e); + runner.effect = e; + return runner; +} +function stop(runner) { + runner.effect.stop(); +} +let shouldTrack = true; +const trackStack = []; +function pauseTracking() { + trackStack.push(shouldTrack); + shouldTrack = false; +} +function resetTracking() { + const last = trackStack.pop(); + shouldTrack = last === void 0 ? true : last; +} +function cleanupEffect(e) { + const { cleanup } = e; + e.cleanup = void 0; + if (cleanup) { + const prevSub = activeSub; + activeSub = void 0; + try { + cleanup(); + } finally { + activeSub = prevSub; + } + } +} + +let globalVersion = 0; +class Link { + constructor(sub, dep) { + this.sub = sub; + this.dep = dep; + this.version = dep.version; + this.nextDep = this.prevDep = this.nextSub = this.prevSub = this.prevActiveLink = void 0; + } +} +class Dep { + constructor(computed) { + this.computed = computed; + this.version = 0; + /** + * Link between this dep and the current active effect + */ + this.activeLink = void 0; + /** + * Doubly linked list representing the subscribing effects (tail) + */ + this.subs = void 0; + /** + * For object property deps cleanup + */ + this.map = void 0; + this.key = void 0; + /** + * Subscriber counter + */ + this.sc = 0; + { + this.subsHead = void 0; + } + } + track(debugInfo) { + if (!activeSub || !shouldTrack || activeSub === this.computed) { + return; + } + let link = this.activeLink; + if (link === void 0 || link.sub !== activeSub) { + link = this.activeLink = new Link(activeSub, this); + if (!activeSub.deps) { + activeSub.deps = activeSub.depsTail = link; + } else { + link.prevDep = activeSub.depsTail; + activeSub.depsTail.nextDep = link; + activeSub.depsTail = link; + } + addSub(link); + } else if (link.version === -1) { + link.version = this.version; + if (link.nextDep) { + const next = link.nextDep; + next.prevDep = link.prevDep; + if (link.prevDep) { + link.prevDep.nextDep = next; + } + link.prevDep = activeSub.depsTail; + link.nextDep = void 0; + activeSub.depsTail.nextDep = link; + activeSub.depsTail = link; + if (activeSub.deps === link) { + activeSub.deps = next; + } + } + } + if (activeSub.onTrack) { + activeSub.onTrack( + extend( + { + effect: activeSub + }, + debugInfo + ) + ); + } + return link; + } + trigger(debugInfo) { + this.version++; + globalVersion++; + this.notify(debugInfo); + } + notify(debugInfo) { + startBatch(); + try { + if (true) { + for (let head = this.subsHead; head; head = head.nextSub) { + if (head.sub.onTrigger && !(head.sub.flags & 8)) { + head.sub.onTrigger( + extend( + { + effect: head.sub + }, + debugInfo + ) + ); + } + } + } + for (let link = this.subs; link; link = link.prevSub) { + if (link.sub.notify()) { + ; + link.sub.dep.notify(); + } + } + } finally { + endBatch(); + } + } +} +function addSub(link) { + link.dep.sc++; + if (link.sub.flags & 4) { + const computed = link.dep.computed; + if (computed && !link.dep.subs) { + computed.flags |= 4 | 16; + for (let l = computed.deps; l; l = l.nextDep) { + addSub(l); + } + } + const currentTail = link.dep.subs; + if (currentTail !== link) { + link.prevSub = currentTail; + if (currentTail) currentTail.nextSub = link; + } + if (link.dep.subsHead === void 0) { + link.dep.subsHead = link; + } + link.dep.subs = link; + } +} +const targetMap = /* @__PURE__ */ new WeakMap(); +const ITERATE_KEY = Symbol( + "Object iterate" +); +const MAP_KEY_ITERATE_KEY = Symbol( + "Map keys iterate" +); +const ARRAY_ITERATE_KEY = Symbol( + "Array iterate" +); +function track(target, type, key) { + if (shouldTrack && activeSub) { + let depsMap = targetMap.get(target); + if (!depsMap) { + targetMap.set(target, depsMap = /* @__PURE__ */ new Map()); + } + let dep = depsMap.get(key); + if (!dep) { + depsMap.set(key, dep = new Dep()); + dep.map = depsMap; + dep.key = key; + } + { + dep.track({ + target, + type, + key + }); + } + } +} +function trigger(target, type, key, newValue, oldValue, oldTarget) { + const depsMap = targetMap.get(target); + if (!depsMap) { + globalVersion++; + return; + } + const run = (dep) => { + if (dep) { + { + dep.trigger({ + target, + type, + key, + newValue, + oldValue, + oldTarget + }); + } + } + }; + startBatch(); + if (type === "clear") { + depsMap.forEach(run); + } else { + const targetIsArray = isArray(target); + const isArrayIndex = targetIsArray && isIntegerKey(key); + if (targetIsArray && key === "length") { + const newLength = Number(newValue); + depsMap.forEach((dep, key2) => { + if (key2 === "length" || key2 === ARRAY_ITERATE_KEY || !isSymbol(key2) && key2 >= newLength) { + run(dep); + } + }); + } else { + if (key !== void 0 || depsMap.has(void 0)) { + run(depsMap.get(key)); + } + if (isArrayIndex) { + run(depsMap.get(ARRAY_ITERATE_KEY)); + } + switch (type) { + case "add": + if (!targetIsArray) { + run(depsMap.get(ITERATE_KEY)); + if (isMap(target)) { + run(depsMap.get(MAP_KEY_ITERATE_KEY)); + } + } else if (isArrayIndex) { + run(depsMap.get("length")); + } + break; + case "delete": + if (!targetIsArray) { + run(depsMap.get(ITERATE_KEY)); + if (isMap(target)) { + run(depsMap.get(MAP_KEY_ITERATE_KEY)); + } + } + break; + case "set": + if (isMap(target)) { + run(depsMap.get(ITERATE_KEY)); + } + break; + } + } + } + endBatch(); +} +function getDepFromReactive(object, key) { + const depMap = targetMap.get(object); + return depMap && depMap.get(key); +} + +function reactiveReadArray(array) { + const raw = toRaw(array); + if (raw === array) return raw; + track(raw, "iterate", ARRAY_ITERATE_KEY); + return isShallow(array) ? raw : raw.map(toReactive); +} +function shallowReadArray(arr) { + track(arr = toRaw(arr), "iterate", ARRAY_ITERATE_KEY); + return arr; +} +const arrayInstrumentations = { + __proto__: null, + [Symbol.iterator]() { + return iterator(this, Symbol.iterator, toReactive); + }, + concat(...args) { + return reactiveReadArray(this).concat( + ...args.map((x) => isArray(x) ? reactiveReadArray(x) : x) + ); + }, + entries() { + return iterator(this, "entries", (value) => { + value[1] = toReactive(value[1]); + return value; + }); + }, + every(fn, thisArg) { + return apply(this, "every", fn, thisArg, void 0, arguments); + }, + filter(fn, thisArg) { + return apply(this, "filter", fn, thisArg, (v) => v.map(toReactive), arguments); + }, + find(fn, thisArg) { + return apply(this, "find", fn, thisArg, toReactive, arguments); + }, + findIndex(fn, thisArg) { + return apply(this, "findIndex", fn, thisArg, void 0, arguments); + }, + findLast(fn, thisArg) { + return apply(this, "findLast", fn, thisArg, toReactive, arguments); + }, + findLastIndex(fn, thisArg) { + return apply(this, "findLastIndex", fn, thisArg, void 0, arguments); + }, + // flat, flatMap could benefit from ARRAY_ITERATE but are not straight-forward to implement + forEach(fn, thisArg) { + return apply(this, "forEach", fn, thisArg, void 0, arguments); + }, + includes(...args) { + return searchProxy(this, "includes", args); + }, + indexOf(...args) { + return searchProxy(this, "indexOf", args); + }, + join(separator) { + return reactiveReadArray(this).join(separator); + }, + // keys() iterator only reads `length`, no optimisation required + lastIndexOf(...args) { + return searchProxy(this, "lastIndexOf", args); + }, + map(fn, thisArg) { + return apply(this, "map", fn, thisArg, void 0, arguments); + }, + pop() { + return noTracking(this, "pop"); + }, + push(...args) { + return noTracking(this, "push", args); + }, + reduce(fn, ...args) { + return reduce(this, "reduce", fn, args); + }, + reduceRight(fn, ...args) { + return reduce(this, "reduceRight", fn, args); + }, + shift() { + return noTracking(this, "shift"); + }, + // slice could use ARRAY_ITERATE but also seems to beg for range tracking + some(fn, thisArg) { + return apply(this, "some", fn, thisArg, void 0, arguments); + }, + splice(...args) { + return noTracking(this, "splice", args); + }, + toReversed() { + return reactiveReadArray(this).toReversed(); + }, + toSorted(comparer) { + return reactiveReadArray(this).toSorted(comparer); + }, + toSpliced(...args) { + return reactiveReadArray(this).toSpliced(...args); + }, + unshift(...args) { + return noTracking(this, "unshift", args); + }, + values() { + return iterator(this, "values", toReactive); + } +}; +function iterator(self, method, wrapValue) { + const arr = shallowReadArray(self); + const iter = arr[method](); + if (arr !== self && !isShallow(self)) { + iter._next = iter.next; + iter.next = () => { + const result = iter._next(); + if (result.value) { + result.value = wrapValue(result.value); + } + return result; + }; + } + return iter; +} +const arrayProto = Array.prototype; +function apply(self, method, fn, thisArg, wrappedRetFn, args) { + const arr = shallowReadArray(self); + const needsWrap = arr !== self && !isShallow(self); + const methodFn = arr[method]; + if (methodFn !== arrayProto[method]) { + const result2 = methodFn.apply(self, args); + return needsWrap ? toReactive(result2) : result2; + } + let wrappedFn = fn; + if (arr !== self) { + if (needsWrap) { + wrappedFn = function(item, index) { + return fn.call(this, toReactive(item), index, self); + }; + } else if (fn.length > 2) { + wrappedFn = function(item, index) { + return fn.call(this, item, index, self); + }; + } + } + const result = methodFn.call(arr, wrappedFn, thisArg); + return needsWrap && wrappedRetFn ? wrappedRetFn(result) : result; +} +function reduce(self, method, fn, args) { + const arr = shallowReadArray(self); + let wrappedFn = fn; + if (arr !== self) { + if (!isShallow(self)) { + wrappedFn = function(acc, item, index) { + return fn.call(this, acc, toReactive(item), index, self); + }; + } else if (fn.length > 3) { + wrappedFn = function(acc, item, index) { + return fn.call(this, acc, item, index, self); + }; + } + } + return arr[method](wrappedFn, ...args); +} +function searchProxy(self, method, args) { + const arr = toRaw(self); + track(arr, "iterate", ARRAY_ITERATE_KEY); + const res = arr[method](...args); + if ((res === -1 || res === false) && isProxy(args[0])) { + args[0] = toRaw(args[0]); + return arr[method](...args); + } + return res; +} +function noTracking(self, method, args = []) { + pauseTracking(); + startBatch(); + const res = toRaw(self)[method].apply(self, args); + endBatch(); + resetTracking(); + return res; +} + +const isNonTrackableKeys = /* @__PURE__ */ makeMap(`__proto__,__v_isRef,__isVue`); +const builtInSymbols = new Set( + /* @__PURE__ */ Object.getOwnPropertyNames(Symbol).filter((key) => key !== "arguments" && key !== "caller").map((key) => Symbol[key]).filter(isSymbol) +); +function hasOwnProperty(key) { + if (!isSymbol(key)) key = String(key); + const obj = toRaw(this); + track(obj, "has", key); + return obj.hasOwnProperty(key); +} +class BaseReactiveHandler { + constructor(_isReadonly = false, _isShallow = false) { + this._isReadonly = _isReadonly; + this._isShallow = _isShallow; + } + get(target, key, receiver) { + const isReadonly2 = this._isReadonly, isShallow2 = this._isShallow; + if (key === "__v_isReactive") { + return !isReadonly2; + } else if (key === "__v_isReadonly") { + return isReadonly2; + } else if (key === "__v_isShallow") { + return isShallow2; + } else if (key === "__v_raw") { + if (receiver === (isReadonly2 ? isShallow2 ? shallowReadonlyMap : readonlyMap : isShallow2 ? shallowReactiveMap : reactiveMap).get(target) || // receiver is not the reactive proxy, but has the same prototype + // this means the receiver is a user proxy of the reactive proxy + Object.getPrototypeOf(target) === Object.getPrototypeOf(receiver)) { + return target; + } + return; + } + const targetIsArray = isArray(target); + if (!isReadonly2) { + let fn; + if (targetIsArray && (fn = arrayInstrumentations[key])) { + return fn; + } + if (key === "hasOwnProperty") { + return hasOwnProperty; + } + } + const res = Reflect.get( + target, + key, + // if this is a proxy wrapping a ref, return methods using the raw ref + // as receiver so that we don't have to call `toRaw` on the ref in all + // its class methods + isRef(target) ? target : receiver + ); + if (isSymbol(key) ? builtInSymbols.has(key) : isNonTrackableKeys(key)) { + return res; + } + if (!isReadonly2) { + track(target, "get", key); + } + if (isShallow2) { + return res; + } + if (isRef(res)) { + return targetIsArray && isIntegerKey(key) ? res : res.value; + } + if (isObject(res)) { + return isReadonly2 ? readonly(res) : reactive(res); + } + return res; + } +} +class MutableReactiveHandler extends BaseReactiveHandler { + constructor(isShallow2 = false) { + super(false, isShallow2); + } + set(target, key, value, receiver) { + let oldValue = target[key]; + if (!this._isShallow) { + const isOldValueReadonly = isReadonly(oldValue); + if (!isShallow(value) && !isReadonly(value)) { + oldValue = toRaw(oldValue); + value = toRaw(value); + } + if (!isArray(target) && isRef(oldValue) && !isRef(value)) { + if (isOldValueReadonly) { + return false; + } else { + oldValue.value = value; + return true; + } + } + } + const hadKey = isArray(target) && isIntegerKey(key) ? Number(key) < target.length : hasOwn(target, key); + const result = Reflect.set( + target, + key, + value, + isRef(target) ? target : receiver + ); + if (target === toRaw(receiver)) { + if (!hadKey) { + trigger(target, "add", key, value); + } else if (hasChanged(value, oldValue)) { + trigger(target, "set", key, value, oldValue); + } + } + return result; + } + deleteProperty(target, key) { + const hadKey = hasOwn(target, key); + const oldValue = target[key]; + const result = Reflect.deleteProperty(target, key); + if (result && hadKey) { + trigger(target, "delete", key, void 0, oldValue); + } + return result; + } + has(target, key) { + const result = Reflect.has(target, key); + if (!isSymbol(key) || !builtInSymbols.has(key)) { + track(target, "has", key); + } + return result; + } + ownKeys(target) { + track( + target, + "iterate", + isArray(target) ? "length" : ITERATE_KEY + ); + return Reflect.ownKeys(target); + } +} +class ReadonlyReactiveHandler extends BaseReactiveHandler { + constructor(isShallow2 = false) { + super(true, isShallow2); + } + set(target, key) { + { + warn$2( + `Set operation on key "${String(key)}" failed: target is readonly.`, + target + ); + } + return true; + } + deleteProperty(target, key) { + { + warn$2( + `Delete operation on key "${String(key)}" failed: target is readonly.`, + target + ); + } + return true; + } +} +const mutableHandlers = /* @__PURE__ */ new MutableReactiveHandler(); +const readonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler(); +const shallowReactiveHandlers = /* @__PURE__ */ new MutableReactiveHandler(true); +const shallowReadonlyHandlers = /* @__PURE__ */ new ReadonlyReactiveHandler(true); + +const toShallow = (value) => value; +const getProto = (v) => Reflect.getPrototypeOf(v); +function createIterableMethod(method, isReadonly2, isShallow2) { + return function(...args) { + const target = this["__v_raw"]; + const rawTarget = toRaw(target); + const targetIsMap = isMap(rawTarget); + const isPair = method === "entries" || method === Symbol.iterator && targetIsMap; + const isKeyOnly = method === "keys" && targetIsMap; + const innerIterator = target[method](...args); + const wrap = isShallow2 ? toShallow : isReadonly2 ? toReadonly : toReactive; + !isReadonly2 && track( + rawTarget, + "iterate", + isKeyOnly ? MAP_KEY_ITERATE_KEY : ITERATE_KEY + ); + return { + // iterator protocol + next() { + const { value, done } = innerIterator.next(); + return done ? { value, done } : { + value: isPair ? [wrap(value[0]), wrap(value[1])] : wrap(value), + done + }; + }, + // iterable protocol + [Symbol.iterator]() { + return this; + } + }; + }; +} +function createReadonlyMethod(type) { + return function(...args) { + { + const key = args[0] ? `on key "${args[0]}" ` : ``; + warn$2( + `${capitalize(type)} operation ${key}failed: target is readonly.`, + toRaw(this) + ); + } + return type === "delete" ? false : type === "clear" ? void 0 : this; + }; +} +function createInstrumentations(readonly, shallow) { + const instrumentations = { + get(key) { + const target = this["__v_raw"]; + const rawTarget = toRaw(target); + const rawKey = toRaw(key); + if (!readonly) { + if (hasChanged(key, rawKey)) { + track(rawTarget, "get", key); + } + track(rawTarget, "get", rawKey); + } + const { has } = getProto(rawTarget); + const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive; + if (has.call(rawTarget, key)) { + return wrap(target.get(key)); + } else if (has.call(rawTarget, rawKey)) { + return wrap(target.get(rawKey)); + } else if (target !== rawTarget) { + target.get(key); + } + }, + get size() { + const target = this["__v_raw"]; + !readonly && track(toRaw(target), "iterate", ITERATE_KEY); + return Reflect.get(target, "size", target); + }, + has(key) { + const target = this["__v_raw"]; + const rawTarget = toRaw(target); + const rawKey = toRaw(key); + if (!readonly) { + if (hasChanged(key, rawKey)) { + track(rawTarget, "has", key); + } + track(rawTarget, "has", rawKey); + } + return key === rawKey ? target.has(key) : target.has(key) || target.has(rawKey); + }, + forEach(callback, thisArg) { + const observed = this; + const target = observed["__v_raw"]; + const rawTarget = toRaw(target); + const wrap = shallow ? toShallow : readonly ? toReadonly : toReactive; + !readonly && track(rawTarget, "iterate", ITERATE_KEY); + return target.forEach((value, key) => { + return callback.call(thisArg, wrap(value), wrap(key), observed); + }); + } + }; + extend( + instrumentations, + readonly ? { + add: createReadonlyMethod("add"), + set: createReadonlyMethod("set"), + delete: createReadonlyMethod("delete"), + clear: createReadonlyMethod("clear") + } : { + add(value) { + if (!shallow && !isShallow(value) && !isReadonly(value)) { + value = toRaw(value); + } + const target = toRaw(this); + const proto = getProto(target); + const hadKey = proto.has.call(target, value); + if (!hadKey) { + target.add(value); + trigger(target, "add", value, value); + } + return this; + }, + set(key, value) { + if (!shallow && !isShallow(value) && !isReadonly(value)) { + value = toRaw(value); + } + const target = toRaw(this); + const { has, get } = getProto(target); + let hadKey = has.call(target, key); + if (!hadKey) { + key = toRaw(key); + hadKey = has.call(target, key); + } else { + checkIdentityKeys(target, has, key); + } + const oldValue = get.call(target, key); + target.set(key, value); + if (!hadKey) { + trigger(target, "add", key, value); + } else if (hasChanged(value, oldValue)) { + trigger(target, "set", key, value, oldValue); + } + return this; + }, + delete(key) { + const target = toRaw(this); + const { has, get } = getProto(target); + let hadKey = has.call(target, key); + if (!hadKey) { + key = toRaw(key); + hadKey = has.call(target, key); + } else { + checkIdentityKeys(target, has, key); + } + const oldValue = get ? get.call(target, key) : void 0; + const result = target.delete(key); + if (hadKey) { + trigger(target, "delete", key, void 0, oldValue); + } + return result; + }, + clear() { + const target = toRaw(this); + const hadItems = target.size !== 0; + const oldTarget = isMap(target) ? new Map(target) : new Set(target) ; + const result = target.clear(); + if (hadItems) { + trigger( + target, + "clear", + void 0, + void 0, + oldTarget + ); + } + return result; + } + } + ); + const iteratorMethods = [ + "keys", + "values", + "entries", + Symbol.iterator + ]; + iteratorMethods.forEach((method) => { + instrumentations[method] = createIterableMethod(method, readonly, shallow); + }); + return instrumentations; +} +function createInstrumentationGetter(isReadonly2, shallow) { + const instrumentations = createInstrumentations(isReadonly2, shallow); + return (target, key, receiver) => { + if (key === "__v_isReactive") { + return !isReadonly2; + } else if (key === "__v_isReadonly") { + return isReadonly2; + } else if (key === "__v_raw") { + return target; + } + return Reflect.get( + hasOwn(instrumentations, key) && key in target ? instrumentations : target, + key, + receiver + ); + }; +} +const mutableCollectionHandlers = { + get: /* @__PURE__ */ createInstrumentationGetter(false, false) +}; +const shallowCollectionHandlers = { + get: /* @__PURE__ */ createInstrumentationGetter(false, true) +}; +const readonlyCollectionHandlers = { + get: /* @__PURE__ */ createInstrumentationGetter(true, false) +}; +const shallowReadonlyCollectionHandlers = { + get: /* @__PURE__ */ createInstrumentationGetter(true, true) +}; +function checkIdentityKeys(target, has, key) { + const rawKey = toRaw(key); + if (rawKey !== key && has.call(target, rawKey)) { + const type = toRawType(target); + warn$2( + `Reactive ${type} contains both the raw and reactive versions of the same object${type === `Map` ? ` as keys` : ``}, which can lead to inconsistencies. Avoid differentiating between the raw and reactive versions of an object and only use the reactive version if possible.` + ); + } +} + +const reactiveMap = /* @__PURE__ */ new WeakMap(); +const shallowReactiveMap = /* @__PURE__ */ new WeakMap(); +const readonlyMap = /* @__PURE__ */ new WeakMap(); +const shallowReadonlyMap = /* @__PURE__ */ new WeakMap(); +function targetTypeMap(rawType) { + switch (rawType) { + case "Object": + case "Array": + return 1 /* COMMON */; + case "Map": + case "Set": + case "WeakMap": + case "WeakSet": + return 2 /* COLLECTION */; + default: + return 0 /* INVALID */; + } +} +function getTargetType(value) { + return value["__v_skip"] || !Object.isExtensible(value) ? 0 /* INVALID */ : targetTypeMap(toRawType(value)); +} +function reactive(target) { + if (isReadonly(target)) { + return target; + } + return createReactiveObject( + target, + false, + mutableHandlers, + mutableCollectionHandlers, + reactiveMap + ); +} +function shallowReactive(target) { + return createReactiveObject( + target, + false, + shallowReactiveHandlers, + shallowCollectionHandlers, + shallowReactiveMap + ); +} +function readonly(target) { + return createReactiveObject( + target, + true, + readonlyHandlers, + readonlyCollectionHandlers, + readonlyMap + ); +} +function shallowReadonly(target) { + return createReactiveObject( + target, + true, + shallowReadonlyHandlers, + shallowReadonlyCollectionHandlers, + shallowReadonlyMap + ); +} +function createReactiveObject(target, isReadonly2, baseHandlers, collectionHandlers, proxyMap) { + if (!isObject(target)) { + { + warn$2( + `value cannot be made ${isReadonly2 ? "readonly" : "reactive"}: ${String( + target + )}` + ); + } + return target; + } + if (target["__v_raw"] && !(isReadonly2 && target["__v_isReactive"])) { + return target; + } + const existingProxy = proxyMap.get(target); + if (existingProxy) { + return existingProxy; + } + const targetType = getTargetType(target); + if (targetType === 0 /* INVALID */) { + return target; + } + const proxy = new Proxy( + target, + targetType === 2 /* COLLECTION */ ? collectionHandlers : baseHandlers + ); + proxyMap.set(target, proxy); + return proxy; +} +function isReactive(value) { + if (isReadonly(value)) { + return isReactive(value["__v_raw"]); + } + return !!(value && value["__v_isReactive"]); +} +function isReadonly(value) { + return !!(value && value["__v_isReadonly"]); +} +function isShallow(value) { + return !!(value && value["__v_isShallow"]); +} +function isProxy(value) { + return value ? !!value["__v_raw"] : false; +} +function toRaw(observed) { + const raw = observed && observed["__v_raw"]; + return raw ? toRaw(raw) : observed; +} +function markRaw(value) { + if (!hasOwn(value, "__v_skip") && Object.isExtensible(value)) { + def(value, "__v_skip", true); + } + return value; +} +const toReactive = (value) => isObject(value) ? reactive(value) : value; +const toReadonly = (value) => isObject(value) ? readonly(value) : value; + +function isRef(r) { + return r ? r["__v_isRef"] === true : false; +} +function ref(value) { + return createRef(value, false); +} +function shallowRef(value) { + return createRef(value, true); +} +function createRef(rawValue, shallow) { + if (isRef(rawValue)) { + return rawValue; + } + return new RefImpl(rawValue, shallow); +} +class RefImpl { + constructor(value, isShallow2) { + this.dep = new Dep(); + this["__v_isRef"] = true; + this["__v_isShallow"] = false; + this._rawValue = isShallow2 ? value : toRaw(value); + this._value = isShallow2 ? value : toReactive(value); + this["__v_isShallow"] = isShallow2; + } + get value() { + { + this.dep.track({ + target: this, + type: "get", + key: "value" + }); + } + return this._value; + } + set value(newValue) { + const oldValue = this._rawValue; + const useDirectValue = this["__v_isShallow"] || isShallow(newValue) || isReadonly(newValue); + newValue = useDirectValue ? newValue : toRaw(newValue); + if (hasChanged(newValue, oldValue)) { + this._rawValue = newValue; + this._value = useDirectValue ? newValue : toReactive(newValue); + { + this.dep.trigger({ + target: this, + type: "set", + key: "value", + newValue, + oldValue + }); + } + } + } +} +function triggerRef(ref2) { + if (ref2.dep) { + { + ref2.dep.trigger({ + target: ref2, + type: "set", + key: "value", + newValue: ref2._value + }); + } + } +} +function unref(ref2) { + return isRef(ref2) ? ref2.value : ref2; +} +function toValue(source) { + return isFunction(source) ? source() : unref(source); +} +const shallowUnwrapHandlers = { + get: (target, key, receiver) => key === "__v_raw" ? target : unref(Reflect.get(target, key, receiver)), + set: (target, key, value, receiver) => { + const oldValue = target[key]; + if (isRef(oldValue) && !isRef(value)) { + oldValue.value = value; + return true; + } else { + return Reflect.set(target, key, value, receiver); + } + } +}; +function proxyRefs(objectWithRefs) { + return isReactive(objectWithRefs) ? objectWithRefs : new Proxy(objectWithRefs, shallowUnwrapHandlers); +} +class CustomRefImpl { + constructor(factory) { + this["__v_isRef"] = true; + this._value = void 0; + const dep = this.dep = new Dep(); + const { get, set } = factory(dep.track.bind(dep), dep.trigger.bind(dep)); + this._get = get; + this._set = set; + } + get value() { + return this._value = this._get(); + } + set value(newVal) { + this._set(newVal); + } +} +function customRef(factory) { + return new CustomRefImpl(factory); +} +function toRefs(object) { + if (!isProxy(object)) { + warn$2(`toRefs() expects a reactive object but received a plain one.`); + } + const ret = isArray(object) ? new Array(object.length) : {}; + for (const key in object) { + ret[key] = propertyToRef(object, key); + } + return ret; +} +class ObjectRefImpl { + constructor(_object, _key, _defaultValue) { + this._object = _object; + this._key = _key; + this._defaultValue = _defaultValue; + this["__v_isRef"] = true; + this._value = void 0; + } + get value() { + const val = this._object[this._key]; + return this._value = val === void 0 ? this._defaultValue : val; + } + set value(newVal) { + this._object[this._key] = newVal; + } + get dep() { + return getDepFromReactive(toRaw(this._object), this._key); + } +} +class GetterRefImpl { + constructor(_getter) { + this._getter = _getter; + this["__v_isRef"] = true; + this["__v_isReadonly"] = true; + this._value = void 0; + } + get value() { + return this._value = this._getter(); + } +} +function toRef(source, key, defaultValue) { + if (isRef(source)) { + return source; + } else if (isFunction(source)) { + return new GetterRefImpl(source); + } else if (isObject(source) && arguments.length > 1) { + return propertyToRef(source, key, defaultValue); + } else { + return ref(source); + } +} +function propertyToRef(source, key, defaultValue) { + const val = source[key]; + return isRef(val) ? val : new ObjectRefImpl(source, key, defaultValue); +} + +class ComputedRefImpl { + constructor(fn, setter, isSSR) { + this.fn = fn; + this.setter = setter; + /** + * @internal + */ + this._value = void 0; + /** + * @internal + */ + this.dep = new Dep(this); + /** + * @internal + */ + this.__v_isRef = true; + // TODO isolatedDeclarations "__v_isReadonly" + // A computed is also a subscriber that tracks other deps + /** + * @internal + */ + this.deps = void 0; + /** + * @internal + */ + this.depsTail = void 0; + /** + * @internal + */ + this.flags = 16; + /** + * @internal + */ + this.globalVersion = globalVersion - 1; + /** + * @internal + */ + this.next = void 0; + // for backwards compat + this.effect = this; + this["__v_isReadonly"] = !setter; + this.isSSR = isSSR; + } + /** + * @internal + */ + notify() { + this.flags |= 16; + if (!(this.flags & 8) && // avoid infinite self recursion + activeSub !== this) { + batch(this, true); + return true; + } + } + get value() { + const link = this.dep.track({ + target: this, + type: "get", + key: "value" + }) ; + refreshComputed(this); + if (link) { + link.version = this.dep.version; + } + return this._value; + } + set value(newValue) { + if (this.setter) { + this.setter(newValue); + } else { + warn$2("Write operation failed: computed value is readonly"); + } + } +} +function computed$1(getterOrOptions, debugOptions, isSSR = false) { + let getter; + let setter; + if (isFunction(getterOrOptions)) { + getter = getterOrOptions; + } else { + getter = getterOrOptions.get; + setter = getterOrOptions.set; + } + const cRef = new ComputedRefImpl(getter, setter, isSSR); + if (debugOptions && !isSSR) { + cRef.onTrack = debugOptions.onTrack; + cRef.onTrigger = debugOptions.onTrigger; + } + return cRef; +} + +const TrackOpTypes = { + "GET": "get", + "HAS": "has", + "ITERATE": "iterate" +}; +const TriggerOpTypes = { + "SET": "set", + "ADD": "add", + "DELETE": "delete", + "CLEAR": "clear" +}; + +const INITIAL_WATCHER_VALUE = {}; +const cleanupMap = /* @__PURE__ */ new WeakMap(); +let activeWatcher = void 0; +function getCurrentWatcher() { + return activeWatcher; +} +function onWatcherCleanup(cleanupFn, failSilently = false, owner = activeWatcher) { + if (owner) { + let cleanups = cleanupMap.get(owner); + if (!cleanups) cleanupMap.set(owner, cleanups = []); + cleanups.push(cleanupFn); + } else if (!failSilently) { + warn$2( + `onWatcherCleanup() was called when there was no active watcher to associate with.` + ); + } +} +function watch$1(source, cb, options = EMPTY_OBJ) { + const { immediate, deep, once, scheduler, augmentJob, call } = options; + const warnInvalidSource = (s) => { + (options.onWarn || warn$2)( + `Invalid watch source: `, + s, + `A watch source can only be a getter/effect function, a ref, a reactive object, or an array of these types.` + ); + }; + const reactiveGetter = (source2) => { + if (deep) return source2; + if (isShallow(source2) || deep === false || deep === 0) + return traverse(source2, 1); + return traverse(source2); + }; + let effect; + let getter; + let cleanup; + let boundCleanup; + let forceTrigger = false; + let isMultiSource = false; + if (isRef(source)) { + getter = () => source.value; + forceTrigger = isShallow(source); + } else if (isReactive(source)) { + getter = () => reactiveGetter(source); + forceTrigger = true; + } else if (isArray(source)) { + isMultiSource = true; + forceTrigger = source.some((s) => isReactive(s) || isShallow(s)); + getter = () => source.map((s) => { + if (isRef(s)) { + return s.value; + } else if (isReactive(s)) { + return reactiveGetter(s); + } else if (isFunction(s)) { + return call ? call(s, 2) : s(); + } else { + warnInvalidSource(s); + } + }); + } else if (isFunction(source)) { + if (cb) { + getter = call ? () => call(source, 2) : source; + } else { + getter = () => { + if (cleanup) { + pauseTracking(); + try { + cleanup(); + } finally { + resetTracking(); + } + } + const currentEffect = activeWatcher; + activeWatcher = effect; + try { + return call ? call(source, 3, [boundCleanup]) : source(boundCleanup); + } finally { + activeWatcher = currentEffect; + } + }; + } + } else { + getter = NOOP; + warnInvalidSource(source); + } + if (cb && deep) { + const baseGetter = getter; + const depth = deep === true ? Infinity : deep; + getter = () => traverse(baseGetter(), depth); + } + const scope = getCurrentScope(); + const watchHandle = () => { + effect.stop(); + if (scope) { + remove(scope.effects, effect); + } + }; + if (once && cb) { + const _cb = cb; + cb = (...args) => { + _cb(...args); + watchHandle(); + }; + } + let oldValue = isMultiSource ? new Array(source.length).fill(INITIAL_WATCHER_VALUE) : INITIAL_WATCHER_VALUE; + const job = (immediateFirstRun) => { + if (!(effect.flags & 1) || !effect.dirty && !immediateFirstRun) { + return; + } + if (cb) { + const newValue = effect.run(); + if (deep || forceTrigger || (isMultiSource ? newValue.some((v, i) => hasChanged(v, oldValue[i])) : hasChanged(newValue, oldValue))) { + if (cleanup) { + cleanup(); + } + const currentWatcher = activeWatcher; + activeWatcher = effect; + try { + const args = [ + newValue, + // pass undefined as the old value when it's changed for the first time + oldValue === INITIAL_WATCHER_VALUE ? void 0 : isMultiSource && oldValue[0] === INITIAL_WATCHER_VALUE ? [] : oldValue, + boundCleanup + ]; + call ? call(cb, 3, args) : ( + // @ts-expect-error + cb(...args) + ); + oldValue = newValue; + } finally { + activeWatcher = currentWatcher; + } + } + } else { + effect.run(); + } + }; + if (augmentJob) { + augmentJob(job); + } + effect = new ReactiveEffect(getter); + effect.scheduler = scheduler ? () => scheduler(job, false) : job; + boundCleanup = (fn) => onWatcherCleanup(fn, false, effect); + cleanup = effect.onStop = () => { + const cleanups = cleanupMap.get(effect); + if (cleanups) { + if (call) { + call(cleanups, 4); + } else { + for (const cleanup2 of cleanups) cleanup2(); + } + cleanupMap.delete(effect); + } + }; + { + effect.onTrack = options.onTrack; + effect.onTrigger = options.onTrigger; + } + if (cb) { + if (immediate) { + job(true); + } else { + oldValue = effect.run(); + } + } else if (scheduler) { + scheduler(job.bind(null, true), true); + } else { + effect.run(); + } + watchHandle.pause = effect.pause.bind(effect); + watchHandle.resume = effect.resume.bind(effect); + watchHandle.stop = watchHandle; + return watchHandle; +} +function traverse(value, depth = Infinity, seen) { + if (depth <= 0 || !isObject(value) || value["__v_skip"]) { + return value; + } + seen = seen || /* @__PURE__ */ new Set(); + if (seen.has(value)) { + return value; + } + seen.add(value); + depth--; + if (isRef(value)) { + traverse(value.value, depth, seen); + } else if (isArray(value)) { + for (let i = 0; i < value.length; i++) { + traverse(value[i], depth, seen); + } + } else if (isSet(value) || isMap(value)) { + value.forEach((v) => { + traverse(v, depth, seen); + }); + } else if (isPlainObject(value)) { + for (const key in value) { + traverse(value[key], depth, seen); + } + for (const key of Object.getOwnPropertySymbols(value)) { + if (Object.prototype.propertyIsEnumerable.call(value, key)) { + traverse(value[key], depth, seen); + } + } + } + return value; +} + +const stack$1 = []; +function pushWarningContext(vnode) { + stack$1.push(vnode); +} +function popWarningContext() { + stack$1.pop(); +} +let isWarning = false; +function warn$1(msg, ...args) { + if (isWarning) return; + isWarning = true; + pauseTracking(); + const instance = stack$1.length ? stack$1[stack$1.length - 1].component : null; + const appWarnHandler = instance && instance.appContext.config.warnHandler; + const trace = getComponentTrace(); + if (appWarnHandler) { + callWithErrorHandling( + appWarnHandler, + instance, + 11, + [ + // eslint-disable-next-line no-restricted-syntax + msg + args.map((a) => { + var _a, _b; + return (_b = (_a = a.toString) == null ? void 0 : _a.call(a)) != null ? _b : JSON.stringify(a); + }).join(""), + instance && instance.proxy, + trace.map( + ({ vnode }) => `at <${formatComponentName(instance, vnode.type)}>` + ).join("\n"), + trace + ] + ); + } else { + const warnArgs = [`[Vue warn]: ${msg}`, ...args]; + if (trace.length && // avoid spamming console during tests + true) { + warnArgs.push(` +`, ...formatTrace(trace)); + } + console.warn(...warnArgs); + } + resetTracking(); + isWarning = false; +} +function getComponentTrace() { + let currentVNode = stack$1[stack$1.length - 1]; + if (!currentVNode) { + return []; + } + const normalizedStack = []; + while (currentVNode) { + const last = normalizedStack[0]; + if (last && last.vnode === currentVNode) { + last.recurseCount++; + } else { + normalizedStack.push({ + vnode: currentVNode, + recurseCount: 0 + }); + } + const parentInstance = currentVNode.component && currentVNode.component.parent; + currentVNode = parentInstance && parentInstance.vnode; + } + return normalizedStack; +} +function formatTrace(trace) { + const logs = []; + trace.forEach((entry, i) => { + logs.push(...i === 0 ? [] : [` +`], ...formatTraceEntry(entry)); + }); + return logs; +} +function formatTraceEntry({ vnode, recurseCount }) { + const postfix = recurseCount > 0 ? `... (${recurseCount} recursive calls)` : ``; + const isRoot = vnode.component ? vnode.component.parent == null : false; + const open = ` at <${formatComponentName( + vnode.component, + vnode.type, + isRoot + )}`; + const close = `>` + postfix; + return vnode.props ? [open, ...formatProps(vnode.props), close] : [open + close]; +} +function formatProps(props) { + const res = []; + const keys = Object.keys(props); + keys.slice(0, 3).forEach((key) => { + res.push(...formatProp(key, props[key])); + }); + if (keys.length > 3) { + res.push(` ...`); + } + return res; +} +function formatProp(key, value, raw) { + if (isString(value)) { + value = JSON.stringify(value); + return raw ? value : [`${key}=${value}`]; + } else if (typeof value === "number" || typeof value === "boolean" || value == null) { + return raw ? value : [`${key}=${value}`]; + } else if (isRef(value)) { + value = formatProp(key, toRaw(value.value), true); + return raw ? value : [`${key}=Ref<`, value, `>`]; + } else if (isFunction(value)) { + return [`${key}=fn${value.name ? `<${value.name}>` : ``}`]; + } else { + value = toRaw(value); + return raw ? value : [`${key}=`, value]; + } +} +function assertNumber(val, type) { + if (val === void 0) { + return; + } else if (typeof val !== "number") { + warn$1(`${type} is not a valid number - got ${JSON.stringify(val)}.`); + } else if (isNaN(val)) { + warn$1(`${type} is NaN - the duration expression might be incorrect.`); + } +} + +const ErrorCodes = { + "SETUP_FUNCTION": 0, + "0": "SETUP_FUNCTION", + "RENDER_FUNCTION": 1, + "1": "RENDER_FUNCTION", + "NATIVE_EVENT_HANDLER": 5, + "5": "NATIVE_EVENT_HANDLER", + "COMPONENT_EVENT_HANDLER": 6, + "6": "COMPONENT_EVENT_HANDLER", + "VNODE_HOOK": 7, + "7": "VNODE_HOOK", + "DIRECTIVE_HOOK": 8, + "8": "DIRECTIVE_HOOK", + "TRANSITION_HOOK": 9, + "9": "TRANSITION_HOOK", + "APP_ERROR_HANDLER": 10, + "10": "APP_ERROR_HANDLER", + "APP_WARN_HANDLER": 11, + "11": "APP_WARN_HANDLER", + "FUNCTION_REF": 12, + "12": "FUNCTION_REF", + "ASYNC_COMPONENT_LOADER": 13, + "13": "ASYNC_COMPONENT_LOADER", + "SCHEDULER": 14, + "14": "SCHEDULER", + "COMPONENT_UPDATE": 15, + "15": "COMPONENT_UPDATE", + "APP_UNMOUNT_CLEANUP": 16, + "16": "APP_UNMOUNT_CLEANUP" +}; +const ErrorTypeStrings$1 = { + ["sp"]: "serverPrefetch hook", + ["bc"]: "beforeCreate hook", + ["c"]: "created hook", + ["bm"]: "beforeMount hook", + ["m"]: "mounted hook", + ["bu"]: "beforeUpdate hook", + ["u"]: "updated", + ["bum"]: "beforeUnmount hook", + ["um"]: "unmounted hook", + ["a"]: "activated hook", + ["da"]: "deactivated hook", + ["ec"]: "errorCaptured hook", + ["rtc"]: "renderTracked hook", + ["rtg"]: "renderTriggered hook", + [0]: "setup function", + [1]: "render function", + [2]: "watcher getter", + [3]: "watcher callback", + [4]: "watcher cleanup function", + [5]: "native event handler", + [6]: "component event handler", + [7]: "vnode hook", + [8]: "directive hook", + [9]: "transition hook", + [10]: "app errorHandler", + [11]: "app warnHandler", + [12]: "ref function", + [13]: "async component loader", + [14]: "scheduler flush", + [15]: "component update", + [16]: "app unmount cleanup function" +}; +function callWithErrorHandling(fn, instance, type, args) { + try { + return args ? fn(...args) : fn(); + } catch (err) { + handleError(err, instance, type); + } +} +function callWithAsyncErrorHandling(fn, instance, type, args) { + if (isFunction(fn)) { + const res = callWithErrorHandling(fn, instance, type, args); + if (res && isPromise(res)) { + res.catch((err) => { + handleError(err, instance, type); + }); + } + return res; + } + if (isArray(fn)) { + const values = []; + for (let i = 0; i < fn.length; i++) { + values.push(callWithAsyncErrorHandling(fn[i], instance, type, args)); + } + return values; + } else { + warn$1( + `Invalid value type passed to callWithAsyncErrorHandling(): ${typeof fn}` + ); + } +} +function handleError(err, instance, type, throwInDev = true) { + const contextVNode = instance ? instance.vnode : null; + const { errorHandler, throwUnhandledErrorInProduction } = instance && instance.appContext.config || EMPTY_OBJ; + if (instance) { + let cur = instance.parent; + const exposedInstance = instance.proxy; + const errorInfo = ErrorTypeStrings$1[type] ; + while (cur) { + const errorCapturedHooks = cur.ec; + if (errorCapturedHooks) { + for (let i = 0; i < errorCapturedHooks.length; i++) { + if (errorCapturedHooks[i](err, exposedInstance, errorInfo) === false) { + return; + } + } + } + cur = cur.parent; + } + if (errorHandler) { + pauseTracking(); + callWithErrorHandling(errorHandler, null, 10, [ + err, + exposedInstance, + errorInfo + ]); + resetTracking(); + return; + } + } + logError(err, type, contextVNode, throwInDev, throwUnhandledErrorInProduction); +} +function logError(err, type, contextVNode, throwInDev = true, throwInProd = false) { + { + const info = ErrorTypeStrings$1[type]; + if (contextVNode) { + pushWarningContext(contextVNode); + } + warn$1(`Unhandled error${info ? ` during execution of ${info}` : ``}`); + if (contextVNode) { + popWarningContext(); + } + if (throwInDev) { + throw err; + } else { + console.error(err); + } + } +} + +const queue = []; +let flushIndex = -1; +const pendingPostFlushCbs = []; +let activePostFlushCbs = null; +let postFlushIndex = 0; +const resolvedPromise = /* @__PURE__ */ Promise.resolve(); +let currentFlushPromise = null; +const RECURSION_LIMIT = 100; +function nextTick(fn) { + const p = currentFlushPromise || resolvedPromise; + return fn ? p.then(this ? fn.bind(this) : fn) : p; +} +function findInsertionIndex(id) { + let start = flushIndex + 1; + let end = queue.length; + while (start < end) { + const middle = start + end >>> 1; + const middleJob = queue[middle]; + const middleJobId = getId(middleJob); + if (middleJobId < id || middleJobId === id && middleJob.flags & 2) { + start = middle + 1; + } else { + end = middle; + } + } + return start; +} +function queueJob(job) { + if (!(job.flags & 1)) { + const jobId = getId(job); + const lastJob = queue[queue.length - 1]; + if (!lastJob || // fast path when the job id is larger than the tail + !(job.flags & 2) && jobId >= getId(lastJob)) { + queue.push(job); + } else { + queue.splice(findInsertionIndex(jobId), 0, job); + } + job.flags |= 1; + queueFlush(); + } +} +function queueFlush() { + if (!currentFlushPromise) { + currentFlushPromise = resolvedPromise.then(flushJobs); + } +} +function queuePostFlushCb(cb) { + if (!isArray(cb)) { + if (activePostFlushCbs && cb.id === -1) { + activePostFlushCbs.splice(postFlushIndex + 1, 0, cb); + } else if (!(cb.flags & 1)) { + pendingPostFlushCbs.push(cb); + cb.flags |= 1; + } + } else { + pendingPostFlushCbs.push(...cb); + } + queueFlush(); +} +function flushPreFlushCbs(instance, seen, i = flushIndex + 1) { + { + seen = seen || /* @__PURE__ */ new Map(); + } + for (; i < queue.length; i++) { + const cb = queue[i]; + if (cb && cb.flags & 2) { + if (instance && cb.id !== instance.uid) { + continue; + } + if (checkRecursiveUpdates(seen, cb)) { + continue; + } + queue.splice(i, 1); + i--; + if (cb.flags & 4) { + cb.flags &= ~1; + } + cb(); + if (!(cb.flags & 4)) { + cb.flags &= ~1; + } + } + } +} +function flushPostFlushCbs(seen) { + if (pendingPostFlushCbs.length) { + const deduped = [...new Set(pendingPostFlushCbs)].sort( + (a, b) => getId(a) - getId(b) + ); + pendingPostFlushCbs.length = 0; + if (activePostFlushCbs) { + activePostFlushCbs.push(...deduped); + return; + } + activePostFlushCbs = deduped; + { + seen = seen || /* @__PURE__ */ new Map(); + } + for (postFlushIndex = 0; postFlushIndex < activePostFlushCbs.length; postFlushIndex++) { + const cb = activePostFlushCbs[postFlushIndex]; + if (checkRecursiveUpdates(seen, cb)) { + continue; + } + if (cb.flags & 4) { + cb.flags &= ~1; + } + if (!(cb.flags & 8)) cb(); + cb.flags &= ~1; + } + activePostFlushCbs = null; + postFlushIndex = 0; + } +} +const getId = (job) => job.id == null ? job.flags & 2 ? -1 : Infinity : job.id; +function flushJobs(seen) { + { + seen = seen || /* @__PURE__ */ new Map(); + } + const check = (job) => checkRecursiveUpdates(seen, job) ; + try { + for (flushIndex = 0; flushIndex < queue.length; flushIndex++) { + const job = queue[flushIndex]; + if (job && !(job.flags & 8)) { + if (check(job)) { + continue; + } + if (job.flags & 4) { + job.flags &= ~1; + } + callWithErrorHandling( + job, + job.i, + job.i ? 15 : 14 + ); + if (!(job.flags & 4)) { + job.flags &= ~1; + } + } + } + } finally { + for (; flushIndex < queue.length; flushIndex++) { + const job = queue[flushIndex]; + if (job) { + job.flags &= ~1; + } + } + flushIndex = -1; + queue.length = 0; + flushPostFlushCbs(seen); + currentFlushPromise = null; + if (queue.length || pendingPostFlushCbs.length) { + flushJobs(seen); + } + } +} +function checkRecursiveUpdates(seen, fn) { + const count = seen.get(fn) || 0; + if (count > RECURSION_LIMIT) { + const instance = fn.i; + const componentName = instance && getComponentName(instance.type); + handleError( + `Maximum recursive updates exceeded${componentName ? ` in component <${componentName}>` : ``}. This means you have a reactive effect that is mutating its own dependencies and thus recursively triggering itself. Possible sources include component template, render function, updated hook or watcher source function.`, + null, + 10 + ); + return true; + } + seen.set(fn, count + 1); + return false; +} + +let isHmrUpdating = false; +const hmrDirtyComponents = /* @__PURE__ */ new Map(); +{ + getGlobalThis().__VUE_HMR_RUNTIME__ = { + createRecord: tryWrap(createRecord), + rerender: tryWrap(rerender), + reload: tryWrap(reload) + }; +} +const map = /* @__PURE__ */ new Map(); +function registerHMR(instance) { + const id = instance.type.__hmrId; + let record = map.get(id); + if (!record) { + createRecord(id, instance.type); + record = map.get(id); + } + record.instances.add(instance); +} +function unregisterHMR(instance) { + map.get(instance.type.__hmrId).instances.delete(instance); +} +function createRecord(id, initialDef) { + if (map.has(id)) { + return false; + } + map.set(id, { + initialDef: normalizeClassComponent(initialDef), + instances: /* @__PURE__ */ new Set() + }); + return true; +} +function normalizeClassComponent(component) { + return isClassComponent(component) ? component.__vccOpts : component; +} +function rerender(id, newRender) { + const record = map.get(id); + if (!record) { + return; + } + record.initialDef.render = newRender; + [...record.instances].forEach((instance) => { + if (newRender) { + instance.render = newRender; + normalizeClassComponent(instance.type).render = newRender; + } + instance.renderCache = []; + isHmrUpdating = true; + instance.update(); + isHmrUpdating = false; + }); +} +function reload(id, newComp) { + const record = map.get(id); + if (!record) return; + newComp = normalizeClassComponent(newComp); + updateComponentDef(record.initialDef, newComp); + const instances = [...record.instances]; + for (let i = 0; i < instances.length; i++) { + const instance = instances[i]; + const oldComp = normalizeClassComponent(instance.type); + let dirtyInstances = hmrDirtyComponents.get(oldComp); + if (!dirtyInstances) { + if (oldComp !== record.initialDef) { + updateComponentDef(oldComp, newComp); + } + hmrDirtyComponents.set(oldComp, dirtyInstances = /* @__PURE__ */ new Set()); + } + dirtyInstances.add(instance); + instance.appContext.propsCache.delete(instance.type); + instance.appContext.emitsCache.delete(instance.type); + instance.appContext.optionsCache.delete(instance.type); + if (instance.ceReload) { + dirtyInstances.add(instance); + instance.ceReload(newComp.styles); + dirtyInstances.delete(instance); + } else if (instance.parent) { + queueJob(() => { + isHmrUpdating = true; + instance.parent.update(); + isHmrUpdating = false; + dirtyInstances.delete(instance); + }); + } else if (instance.appContext.reload) { + instance.appContext.reload(); + } else if (typeof window !== "undefined") { + window.location.reload(); + } else { + console.warn( + "[HMR] Root or manually mounted instance modified. Full reload required." + ); + } + if (instance.root.ce && instance !== instance.root) { + instance.root.ce._removeChildStyle(oldComp); + } + } + queuePostFlushCb(() => { + hmrDirtyComponents.clear(); + }); +} +function updateComponentDef(oldComp, newComp) { + extend(oldComp, newComp); + for (const key in oldComp) { + if (key !== "__file" && !(key in newComp)) { + delete oldComp[key]; + } + } +} +function tryWrap(fn) { + return (id, arg) => { + try { + return fn(id, arg); + } catch (e) { + console.error(e); + console.warn( + `[HMR] Something went wrong during Vue component hot-reload. Full reload required.` + ); + } + }; +} + +let devtools$1; +let buffer = []; +let devtoolsNotInstalled = false; +function emit$1(event, ...args) { + if (devtools$1) { + devtools$1.emit(event, ...args); + } else if (!devtoolsNotInstalled) { + buffer.push({ event, args }); + } +} +function setDevtoolsHook$1(hook, target) { + var _a, _b; + devtools$1 = hook; + if (devtools$1) { + devtools$1.enabled = true; + buffer.forEach(({ event, args }) => devtools$1.emit(event, ...args)); + buffer = []; + } else if ( + // handle late devtools injection - only do this if we are in an actual + // browser environment to avoid the timer handle stalling test runner exit + // (#4815) + typeof window !== "undefined" && // some envs mock window but not fully + window.HTMLElement && // also exclude jsdom + // eslint-disable-next-line no-restricted-syntax + !((_b = (_a = window.navigator) == null ? void 0 : _a.userAgent) == null ? void 0 : _b.includes("jsdom")) + ) { + const replay = target.__VUE_DEVTOOLS_HOOK_REPLAY__ = target.__VUE_DEVTOOLS_HOOK_REPLAY__ || []; + replay.push((newHook) => { + setDevtoolsHook$1(newHook, target); + }); + setTimeout(() => { + if (!devtools$1) { + target.__VUE_DEVTOOLS_HOOK_REPLAY__ = null; + devtoolsNotInstalled = true; + buffer = []; + } + }, 3e3); + } else { + devtoolsNotInstalled = true; + buffer = []; + } +} +function devtoolsInitApp(app, version) { + emit$1("app:init" /* APP_INIT */, app, version, { + Fragment, + Text, + Comment, + Static + }); +} +function devtoolsUnmountApp(app) { + emit$1("app:unmount" /* APP_UNMOUNT */, app); +} +const devtoolsComponentAdded = /* @__PURE__ */ createDevtoolsComponentHook("component:added" /* COMPONENT_ADDED */); +const devtoolsComponentUpdated = /* @__PURE__ */ createDevtoolsComponentHook("component:updated" /* COMPONENT_UPDATED */); +const _devtoolsComponentRemoved = /* @__PURE__ */ createDevtoolsComponentHook( + "component:removed" /* COMPONENT_REMOVED */ +); +const devtoolsComponentRemoved = (component) => { + if (devtools$1 && typeof devtools$1.cleanupBuffer === "function" && // remove the component if it wasn't buffered + !devtools$1.cleanupBuffer(component)) { + _devtoolsComponentRemoved(component); + } +}; +/*! #__NO_SIDE_EFFECTS__ */ +// @__NO_SIDE_EFFECTS__ +function createDevtoolsComponentHook(hook) { + return (component) => { + emit$1( + hook, + component.appContext.app, + component.uid, + component.parent ? component.parent.uid : void 0, + component + ); + }; +} +const devtoolsPerfStart = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:start" /* PERFORMANCE_START */); +const devtoolsPerfEnd = /* @__PURE__ */ createDevtoolsPerformanceHook("perf:end" /* PERFORMANCE_END */); +function createDevtoolsPerformanceHook(hook) { + return (component, type, time) => { + emit$1(hook, component.appContext.app, component.uid, component, type, time); + }; +} +function devtoolsComponentEmit(component, event, params) { + emit$1( + "component:emit" /* COMPONENT_EMIT */, + component.appContext.app, + component, + event, + params + ); +} + +let currentRenderingInstance = null; +let currentScopeId = null; +function setCurrentRenderingInstance(instance) { + const prev = currentRenderingInstance; + currentRenderingInstance = instance; + currentScopeId = instance && instance.type.__scopeId || null; + return prev; +} +function pushScopeId(id) { + currentScopeId = id; +} +function popScopeId() { + currentScopeId = null; +} +const withScopeId = (_id) => withCtx; +function withCtx(fn, ctx = currentRenderingInstance, isNonScopedSlot) { + if (!ctx) return fn; + if (fn._n) { + return fn; + } + const renderFnWithContext = (...args) => { + if (renderFnWithContext._d) { + setBlockTracking(-1); + } + const prevInstance = setCurrentRenderingInstance(ctx); + let res; + try { + res = fn(...args); + } finally { + setCurrentRenderingInstance(prevInstance); + if (renderFnWithContext._d) { + setBlockTracking(1); + } + } + { + devtoolsComponentUpdated(ctx); + } + return res; + }; + renderFnWithContext._n = true; + renderFnWithContext._c = true; + renderFnWithContext._d = true; + return renderFnWithContext; +} + +function validateDirectiveName(name) { + if (isBuiltInDirective(name)) { + warn$1("Do not use built-in directive ids as custom directive id: " + name); + } +} +function withDirectives(vnode, directives) { + if (currentRenderingInstance === null) { + warn$1(`withDirectives can only be used inside render functions.`); + return vnode; + } + const instance = getComponentPublicInstance(currentRenderingInstance); + const bindings = vnode.dirs || (vnode.dirs = []); + for (let i = 0; i < directives.length; i++) { + let [dir, value, arg, modifiers = EMPTY_OBJ] = directives[i]; + if (dir) { + if (isFunction(dir)) { + dir = { + mounted: dir, + updated: dir + }; + } + if (dir.deep) { + traverse(value); + } + bindings.push({ + dir, + instance, + value, + oldValue: void 0, + arg, + modifiers + }); + } + } + return vnode; +} +function invokeDirectiveHook(vnode, prevVNode, instance, name) { + const bindings = vnode.dirs; + const oldBindings = prevVNode && prevVNode.dirs; + for (let i = 0; i < bindings.length; i++) { + const binding = bindings[i]; + if (oldBindings) { + binding.oldValue = oldBindings[i].value; + } + let hook = binding.dir[name]; + if (hook) { + pauseTracking(); + callWithAsyncErrorHandling(hook, instance, 8, [ + vnode.el, + binding, + vnode, + prevVNode + ]); + resetTracking(); + } + } +} + +const TeleportEndKey = Symbol("_vte"); +const isTeleport = (type) => type.__isTeleport; +const isTeleportDisabled = (props) => props && (props.disabled || props.disabled === ""); +const isTeleportDeferred = (props) => props && (props.defer || props.defer === ""); +const isTargetSVG = (target) => typeof SVGElement !== "undefined" && target instanceof SVGElement; +const isTargetMathML = (target) => typeof MathMLElement === "function" && target instanceof MathMLElement; +const resolveTarget = (props, select) => { + const targetSelector = props && props.to; + if (isString(targetSelector)) { + if (!select) { + warn$1( + `Current renderer does not support string target for Teleports. (missing querySelector renderer option)` + ); + return null; + } else { + const target = select(targetSelector); + if (!target && !isTeleportDisabled(props)) { + warn$1( + `Failed to locate Teleport target with selector "${targetSelector}". Note the target element must exist before the component is mounted - i.e. the target cannot be rendered by the component itself, and ideally should be outside of the entire Vue component tree.` + ); + } + return target; + } + } else { + if (!targetSelector && !isTeleportDisabled(props)) { + warn$1(`Invalid Teleport target: ${targetSelector}`); + } + return targetSelector; + } +}; +const TeleportImpl = { + name: "Teleport", + __isTeleport: true, + process(n1, n2, container, anchor, parentComponent, parentSuspense, namespace, slotScopeIds, optimized, internals) { + const { + mc: mountChildren, + pc: patchChildren, + pbc: patchBlockChildren, + o: { insert, querySelector, createText, createComment } + } = internals; + const disabled = isTeleportDisabled(n2.props); + let { shapeFlag, children, dynamicChildren } = n2; + if (isHmrUpdating) { + optimized = false; + dynamicChildren = null; + } + if (n1 == null) { + const placeholder = n2.el = createComment("teleport start") ; + const mainAnchor = n2.anchor = createComment("teleport end") ; + insert(placeholder, container, anchor); + insert(mainAnchor, container, anchor); + const mount = (container2, anchor2) => { + if (shapeFlag & 16) { + if (parentComponent && parentComponent.isCE) { + parentComponent.ce._teleportTarget = container2; + } + mountChildren( + children, + container2, + anchor2, + parentComponent, + parentSuspense, + namespace, + slotScopeIds, + optimized + ); + } + }; + const mountToTarget = () => { + const target = n2.target = resolveTarget(n2.props, querySelector); + const targetAnchor = prepareAnchor(target, n2, createText, insert); + if (target) { + if (namespace !== "svg" && isTargetSVG(target)) { + namespace = "svg"; + } else if (namespace !== "mathml" && isTargetMathML(target)) { + namespace = "mathml"; + } + if (!disabled) { + mount(target, targetAnchor); + updateCssVars(n2, false); + } + } else if (!disabled) { + warn$1( + "Invalid Teleport target on mount:", + target, + `(${typeof target})` + ); + } + }; + if (disabled) { + mount(container, mainAnchor); + updateCssVars(n2, true); + } + if (isTeleportDeferred(n2.props)) { + queuePostRenderEffect(mountToTarget, parentSuspense); + } else { + mountToTarget(); + } + } else { + n2.el = n1.el; + n2.targetStart = n1.targetStart; + const mainAnchor = n2.anchor = n1.anchor; + const target = n2.target = n1.target; + const targetAnchor = n2.targetAnchor = n1.targetAnchor; + const wasDisabled = isTeleportDisabled(n1.props); + const currentContainer = wasDisabled ? container : target; + const currentAnchor = wasDisabled ? mainAnchor : targetAnchor; + if (namespace === "svg" || isTargetSVG(target)) { + namespace = "svg"; + } else if (namespace === "mathml" || isTargetMathML(target)) { + namespace = "mathml"; + } + if (dynamicChildren) { + patchBlockChildren( + n1.dynamicChildren, + dynamicChildren, + currentContainer, + parentComponent, + parentSuspense, + namespace, + slotScopeIds + ); + traverseStaticChildren(n1, n2, true); + } else if (!optimized) { + patchChildren( + n1, + n2, + currentContainer, + currentAnchor, + parentComponent, + parentSuspense, + namespace, + slotScopeIds, + false + ); + } + if (disabled) { + if (!wasDisabled) { + moveTeleport( + n2, + container, + mainAnchor, + internals, + 1 + ); + } else { + if (n2.props && n1.props && n2.props.to !== n1.props.to) { + n2.props.to = n1.props.to; + } + } + } else { + if ((n2.props && n2.props.to) !== (n1.props && n1.props.to)) { + const nextTarget = n2.target = resolveTarget( + n2.props, + querySelector + ); + if (nextTarget) { + moveTeleport( + n2, + nextTarget, + null, + internals, + 0 + ); + } else { + warn$1( + "Invalid Teleport target on update:", + target, + `(${typeof target})` + ); + } + } else if (wasDisabled) { + moveTeleport( + n2, + target, + targetAnchor, + internals, + 1 + ); + } + } + updateCssVars(n2, disabled); + } + }, + remove(vnode, parentComponent, parentSuspense, { um: unmount, o: { remove: hostRemove } }, doRemove) { + const { + shapeFlag, + children, + anchor, + targetStart, + targetAnchor, + target, + props + } = vnode; + if (target) { + hostRemove(targetStart); + hostRemove(targetAnchor); + } + doRemove && hostRemove(anchor); + if (shapeFlag & 16) { + const shouldRemove = doRemove || !isTeleportDisabled(props); + for (let i = 0; i < children.length; i++) { + const child = children[i]; + unmount( + child, + parentComponent, + parentSuspense, + shouldRemove, + !!child.dynamicChildren + ); + } + } + }, + move: moveTeleport, + hydrate: hydrateTeleport +}; +function moveTeleport(vnode, container, parentAnchor, { o: { insert }, m: move }, moveType = 2) { + if (moveType === 0) { + insert(vnode.targetAnchor, container, parentAnchor); + } + const { el, anchor, shapeFlag, children, props } = vnode; + const isReorder = moveType === 2; + if (isReorder) { + insert(el, container, parentAnchor); + } + if (!isReorder || isTeleportDisabled(props)) { + if (shapeFlag & 16) { + for (let i = 0; i < children.length; i++) { + move( + children[i], + container, + parentAnchor, + 2 + ); + } + } + } + if (isReorder) { + insert(anchor, container, parentAnchor); + } +} +function hydrateTeleport(node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized, { + o: { nextSibling, parentNode, querySelector, insert, createText } +}, hydrateChildren) { + const target = vnode.target = resolveTarget( + vnode.props, + querySelector + ); + if (target) { + const disabled = isTeleportDisabled(vnode.props); + const targetNode = target._lpa || target.firstChild; + if (vnode.shapeFlag & 16) { + if (disabled) { + vnode.anchor = hydrateChildren( + nextSibling(node), + vnode, + parentNode(node), + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + vnode.targetStart = targetNode; + vnode.targetAnchor = targetNode && nextSibling(targetNode); + } else { + vnode.anchor = nextSibling(node); + let targetAnchor = targetNode; + while (targetAnchor) { + if (targetAnchor && targetAnchor.nodeType === 8) { + if (targetAnchor.data === "teleport start anchor") { + vnode.targetStart = targetAnchor; + } else if (targetAnchor.data === "teleport anchor") { + vnode.targetAnchor = targetAnchor; + target._lpa = vnode.targetAnchor && nextSibling(vnode.targetAnchor); + break; + } + } + targetAnchor = nextSibling(targetAnchor); + } + if (!vnode.targetAnchor) { + prepareAnchor(target, vnode, createText, insert); + } + hydrateChildren( + targetNode && nextSibling(targetNode), + vnode, + target, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + } + } + updateCssVars(vnode, disabled); + } + return vnode.anchor && nextSibling(vnode.anchor); +} +const Teleport = TeleportImpl; +function updateCssVars(vnode, isDisabled) { + const ctx = vnode.ctx; + if (ctx && ctx.ut) { + let node, anchor; + if (isDisabled) { + node = vnode.el; + anchor = vnode.anchor; + } else { + node = vnode.targetStart; + anchor = vnode.targetAnchor; + } + while (node && node !== anchor) { + if (node.nodeType === 1) node.setAttribute("data-v-owner", ctx.uid); + node = node.nextSibling; + } + ctx.ut(); + } +} +function prepareAnchor(target, vnode, createText, insert) { + const targetStart = vnode.targetStart = createText(""); + const targetAnchor = vnode.targetAnchor = createText(""); + targetStart[TeleportEndKey] = targetAnchor; + if (target) { + insert(targetStart, target); + insert(targetAnchor, target); + } + return targetAnchor; +} + +const leaveCbKey = Symbol("_leaveCb"); +const enterCbKey$1 = Symbol("_enterCb"); +function useTransitionState() { + const state = { + isMounted: false, + isLeaving: false, + isUnmounting: false, + leavingVNodes: /* @__PURE__ */ new Map() + }; + onMounted(() => { + state.isMounted = true; + }); + onBeforeUnmount(() => { + state.isUnmounting = true; + }); + return state; +} +const TransitionHookValidator = [Function, Array]; +const BaseTransitionPropsValidators = { + mode: String, + appear: Boolean, + persisted: Boolean, + // enter + onBeforeEnter: TransitionHookValidator, + onEnter: TransitionHookValidator, + onAfterEnter: TransitionHookValidator, + onEnterCancelled: TransitionHookValidator, + // leave + onBeforeLeave: TransitionHookValidator, + onLeave: TransitionHookValidator, + onAfterLeave: TransitionHookValidator, + onLeaveCancelled: TransitionHookValidator, + // appear + onBeforeAppear: TransitionHookValidator, + onAppear: TransitionHookValidator, + onAfterAppear: TransitionHookValidator, + onAppearCancelled: TransitionHookValidator +}; +const recursiveGetSubtree = (instance) => { + const subTree = instance.subTree; + return subTree.component ? recursiveGetSubtree(subTree.component) : subTree; +}; +const BaseTransitionImpl = { + name: `BaseTransition`, + props: BaseTransitionPropsValidators, + setup(props, { slots }) { + const instance = getCurrentInstance(); + const state = useTransitionState(); + return () => { + const children = slots.default && getTransitionRawChildren(slots.default(), true); + if (!children || !children.length) { + return; + } + const child = findNonCommentChild(children); + const rawProps = toRaw(props); + const { mode } = rawProps; + if (mode && mode !== "in-out" && mode !== "out-in" && mode !== "default") { + warn$1(`invalid mode: ${mode}`); + } + if (state.isLeaving) { + return emptyPlaceholder(child); + } + const innerChild = getInnerChild$1(child); + if (!innerChild) { + return emptyPlaceholder(child); + } + let enterHooks = resolveTransitionHooks( + innerChild, + rawProps, + state, + instance, + // #11061, ensure enterHooks is fresh after clone + (hooks) => enterHooks = hooks + ); + if (innerChild.type !== Comment) { + setTransitionHooks(innerChild, enterHooks); + } + const oldChild = instance.subTree; + const oldInnerChild = oldChild && getInnerChild$1(oldChild); + if (oldInnerChild && oldInnerChild.type !== Comment && !isSameVNodeType(innerChild, oldInnerChild) && recursiveGetSubtree(instance).type !== Comment) { + const leavingHooks = resolveTransitionHooks( + oldInnerChild, + rawProps, + state, + instance + ); + setTransitionHooks(oldInnerChild, leavingHooks); + if (mode === "out-in" && innerChild.type !== Comment) { + state.isLeaving = true; + leavingHooks.afterLeave = () => { + state.isLeaving = false; + if (!(instance.job.flags & 8)) { + instance.update(); + } + delete leavingHooks.afterLeave; + }; + return emptyPlaceholder(child); + } else if (mode === "in-out" && innerChild.type !== Comment) { + leavingHooks.delayLeave = (el, earlyRemove, delayedLeave) => { + const leavingVNodesCache = getLeavingNodesForType( + state, + oldInnerChild + ); + leavingVNodesCache[String(oldInnerChild.key)] = oldInnerChild; + el[leaveCbKey] = () => { + earlyRemove(); + el[leaveCbKey] = void 0; + delete enterHooks.delayedLeave; + }; + enterHooks.delayedLeave = delayedLeave; + }; + } + } + return child; + }; + } +}; +function findNonCommentChild(children) { + let child = children[0]; + if (children.length > 1) { + let hasFound = false; + for (const c of children) { + if (c.type !== Comment) { + if (hasFound) { + warn$1( + " can only be used on a single element or component. Use for lists." + ); + break; + } + child = c; + hasFound = true; + } + } + } + return child; +} +const BaseTransition = BaseTransitionImpl; +function getLeavingNodesForType(state, vnode) { + const { leavingVNodes } = state; + let leavingVNodesCache = leavingVNodes.get(vnode.type); + if (!leavingVNodesCache) { + leavingVNodesCache = /* @__PURE__ */ Object.create(null); + leavingVNodes.set(vnode.type, leavingVNodesCache); + } + return leavingVNodesCache; +} +function resolveTransitionHooks(vnode, props, state, instance, postClone) { + const { + appear, + mode, + persisted = false, + onBeforeEnter, + onEnter, + onAfterEnter, + onEnterCancelled, + onBeforeLeave, + onLeave, + onAfterLeave, + onLeaveCancelled, + onBeforeAppear, + onAppear, + onAfterAppear, + onAppearCancelled + } = props; + const key = String(vnode.key); + const leavingVNodesCache = getLeavingNodesForType(state, vnode); + const callHook = (hook, args) => { + hook && callWithAsyncErrorHandling( + hook, + instance, + 9, + args + ); + }; + const callAsyncHook = (hook, args) => { + const done = args[1]; + callHook(hook, args); + if (isArray(hook)) { + if (hook.every((hook2) => hook2.length <= 1)) done(); + } else if (hook.length <= 1) { + done(); + } + }; + const hooks = { + mode, + persisted, + beforeEnter(el) { + let hook = onBeforeEnter; + if (!state.isMounted) { + if (appear) { + hook = onBeforeAppear || onBeforeEnter; + } else { + return; + } + } + if (el[leaveCbKey]) { + el[leaveCbKey]( + true + /* cancelled */ + ); + } + const leavingVNode = leavingVNodesCache[key]; + if (leavingVNode && isSameVNodeType(vnode, leavingVNode) && leavingVNode.el[leaveCbKey]) { + leavingVNode.el[leaveCbKey](); + } + callHook(hook, [el]); + }, + enter(el) { + let hook = onEnter; + let afterHook = onAfterEnter; + let cancelHook = onEnterCancelled; + if (!state.isMounted) { + if (appear) { + hook = onAppear || onEnter; + afterHook = onAfterAppear || onAfterEnter; + cancelHook = onAppearCancelled || onEnterCancelled; + } else { + return; + } + } + let called = false; + const done = el[enterCbKey$1] = (cancelled) => { + if (called) return; + called = true; + if (cancelled) { + callHook(cancelHook, [el]); + } else { + callHook(afterHook, [el]); + } + if (hooks.delayedLeave) { + hooks.delayedLeave(); + } + el[enterCbKey$1] = void 0; + }; + if (hook) { + callAsyncHook(hook, [el, done]); + } else { + done(); + } + }, + leave(el, remove) { + const key2 = String(vnode.key); + if (el[enterCbKey$1]) { + el[enterCbKey$1]( + true + /* cancelled */ + ); + } + if (state.isUnmounting) { + return remove(); + } + callHook(onBeforeLeave, [el]); + let called = false; + const done = el[leaveCbKey] = (cancelled) => { + if (called) return; + called = true; + remove(); + if (cancelled) { + callHook(onLeaveCancelled, [el]); + } else { + callHook(onAfterLeave, [el]); + } + el[leaveCbKey] = void 0; + if (leavingVNodesCache[key2] === vnode) { + delete leavingVNodesCache[key2]; + } + }; + leavingVNodesCache[key2] = vnode; + if (onLeave) { + callAsyncHook(onLeave, [el, done]); + } else { + done(); + } + }, + clone(vnode2) { + const hooks2 = resolveTransitionHooks( + vnode2, + props, + state, + instance, + postClone + ); + if (postClone) postClone(hooks2); + return hooks2; + } + }; + return hooks; +} +function emptyPlaceholder(vnode) { + if (isKeepAlive(vnode)) { + vnode = cloneVNode(vnode); + vnode.children = null; + return vnode; + } +} +function getInnerChild$1(vnode) { + if (!isKeepAlive(vnode)) { + if (isTeleport(vnode.type) && vnode.children) { + return findNonCommentChild(vnode.children); + } + return vnode; + } + if (vnode.component) { + return vnode.component.subTree; + } + const { shapeFlag, children } = vnode; + if (children) { + if (shapeFlag & 16) { + return children[0]; + } + if (shapeFlag & 32 && isFunction(children.default)) { + return children.default(); + } + } +} +function setTransitionHooks(vnode, hooks) { + if (vnode.shapeFlag & 6 && vnode.component) { + vnode.transition = hooks; + setTransitionHooks(vnode.component.subTree, hooks); + } else if (vnode.shapeFlag & 128) { + vnode.ssContent.transition = hooks.clone(vnode.ssContent); + vnode.ssFallback.transition = hooks.clone(vnode.ssFallback); + } else { + vnode.transition = hooks; + } +} +function getTransitionRawChildren(children, keepComment = false, parentKey) { + let ret = []; + let keyedFragmentCount = 0; + for (let i = 0; i < children.length; i++) { + let child = children[i]; + const key = parentKey == null ? child.key : String(parentKey) + String(child.key != null ? child.key : i); + if (child.type === Fragment) { + if (child.patchFlag & 128) keyedFragmentCount++; + ret = ret.concat( + getTransitionRawChildren(child.children, keepComment, key) + ); + } else if (keepComment || child.type !== Comment) { + ret.push(key != null ? cloneVNode(child, { key }) : child); + } + } + if (keyedFragmentCount > 1) { + for (let i = 0; i < ret.length; i++) { + ret[i].patchFlag = -2; + } + } + return ret; +} + +/*! #__NO_SIDE_EFFECTS__ */ +// @__NO_SIDE_EFFECTS__ +function defineComponent(options, extraOptions) { + return isFunction(options) ? ( + // #8236: extend call and options.name access are considered side-effects + // by Rollup, so we have to wrap it in a pure-annotated IIFE. + /* @__PURE__ */ (() => extend({ name: options.name }, extraOptions, { setup: options }))() + ) : options; +} + +function useId() { + const i = getCurrentInstance(); + if (i) { + return (i.appContext.config.idPrefix || "v") + "-" + i.ids[0] + i.ids[1]++; + } else { + warn$1( + `useId() is called when there is no active component instance to be associated with.` + ); + } + return ""; +} +function markAsyncBoundary(instance) { + instance.ids = [instance.ids[0] + instance.ids[2]++ + "-", 0, 0]; +} + +const knownTemplateRefs = /* @__PURE__ */ new WeakSet(); +function useTemplateRef(key) { + const i = getCurrentInstance(); + const r = shallowRef(null); + if (i) { + const refs = i.refs === EMPTY_OBJ ? i.refs = {} : i.refs; + let desc; + if ((desc = Object.getOwnPropertyDescriptor(refs, key)) && !desc.configurable) { + warn$1(`useTemplateRef('${key}') already exists.`); + } else { + Object.defineProperty(refs, key, { + enumerable: true, + get: () => r.value, + set: (val) => r.value = val + }); + } + } else { + warn$1( + `useTemplateRef() is called when there is no active component instance to be associated with.` + ); + } + const ret = readonly(r) ; + { + knownTemplateRefs.add(ret); + } + return ret; +} + +function setRef(rawRef, oldRawRef, parentSuspense, vnode, isUnmount = false) { + if (isArray(rawRef)) { + rawRef.forEach( + (r, i) => setRef( + r, + oldRawRef && (isArray(oldRawRef) ? oldRawRef[i] : oldRawRef), + parentSuspense, + vnode, + isUnmount + ) + ); + return; + } + if (isAsyncWrapper(vnode) && !isUnmount) { + return; + } + const refValue = vnode.shapeFlag & 4 ? getComponentPublicInstance(vnode.component) : vnode.el; + const value = isUnmount ? null : refValue; + const { i: owner, r: ref } = rawRef; + if (!owner) { + warn$1( + `Missing ref owner context. ref cannot be used on hoisted vnodes. A vnode with ref must be created inside the render function.` + ); + return; + } + const oldRef = oldRawRef && oldRawRef.r; + const refs = owner.refs === EMPTY_OBJ ? owner.refs = {} : owner.refs; + const setupState = owner.setupState; + const rawSetupState = toRaw(setupState); + const canSetSetupRef = setupState === EMPTY_OBJ ? () => false : (key) => { + { + if (hasOwn(rawSetupState, key) && !isRef(rawSetupState[key])) { + warn$1( + `Template ref "${key}" used on a non-ref value. It will not work in the production build.` + ); + } + if (knownTemplateRefs.has(rawSetupState[key])) { + return false; + } + } + return hasOwn(rawSetupState, key); + }; + if (oldRef != null && oldRef !== ref) { + if (isString(oldRef)) { + refs[oldRef] = null; + if (canSetSetupRef(oldRef)) { + setupState[oldRef] = null; + } + } else if (isRef(oldRef)) { + oldRef.value = null; + } + } + if (isFunction(ref)) { + callWithErrorHandling(ref, owner, 12, [value, refs]); + } else { + const _isString = isString(ref); + const _isRef = isRef(ref); + if (_isString || _isRef) { + const doSet = () => { + if (rawRef.f) { + const existing = _isString ? canSetSetupRef(ref) ? setupState[ref] : refs[ref] : ref.value; + if (isUnmount) { + isArray(existing) && remove(existing, refValue); + } else { + if (!isArray(existing)) { + if (_isString) { + refs[ref] = [refValue]; + if (canSetSetupRef(ref)) { + setupState[ref] = refs[ref]; + } + } else { + ref.value = [refValue]; + if (rawRef.k) refs[rawRef.k] = ref.value; + } + } else if (!existing.includes(refValue)) { + existing.push(refValue); + } + } + } else if (_isString) { + refs[ref] = value; + if (canSetSetupRef(ref)) { + setupState[ref] = value; + } + } else if (_isRef) { + ref.value = value; + if (rawRef.k) refs[rawRef.k] = value; + } else { + warn$1("Invalid template ref type:", ref, `(${typeof ref})`); + } + }; + if (value) { + doSet.id = -1; + queuePostRenderEffect(doSet, parentSuspense); + } else { + doSet(); + } + } else { + warn$1("Invalid template ref type:", ref, `(${typeof ref})`); + } + } +} + +let hasLoggedMismatchError = false; +const logMismatchError = () => { + if (hasLoggedMismatchError) { + return; + } + console.error("Hydration completed but contains mismatches."); + hasLoggedMismatchError = true; +}; +const isSVGContainer = (container) => container.namespaceURI.includes("svg") && container.tagName !== "foreignObject"; +const isMathMLContainer = (container) => container.namespaceURI.includes("MathML"); +const getContainerType = (container) => { + if (container.nodeType !== 1) return void 0; + if (isSVGContainer(container)) return "svg"; + if (isMathMLContainer(container)) return "mathml"; + return void 0; +}; +const isComment = (node) => node.nodeType === 8; +function createHydrationFunctions(rendererInternals) { + const { + mt: mountComponent, + p: patch, + o: { + patchProp, + createText, + nextSibling, + parentNode, + remove, + insert, + createComment + } + } = rendererInternals; + const hydrate = (vnode, container) => { + if (!container.hasChildNodes()) { + warn$1( + `Attempting to hydrate existing markup but container is empty. Performing full mount instead.` + ); + patch(null, vnode, container); + flushPostFlushCbs(); + container._vnode = vnode; + return; + } + hydrateNode(container.firstChild, vnode, null, null, null); + flushPostFlushCbs(); + container._vnode = vnode; + }; + const hydrateNode = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized = false) => { + optimized = optimized || !!vnode.dynamicChildren; + const isFragmentStart = isComment(node) && node.data === "["; + const onMismatch = () => handleMismatch( + node, + vnode, + parentComponent, + parentSuspense, + slotScopeIds, + isFragmentStart + ); + const { type, ref, shapeFlag, patchFlag } = vnode; + let domType = node.nodeType; + vnode.el = node; + { + def(node, "__vnode", vnode, true); + def(node, "__vueParentComponent", parentComponent, true); + } + if (patchFlag === -2) { + optimized = false; + vnode.dynamicChildren = null; + } + let nextNode = null; + switch (type) { + case Text: + if (domType !== 3) { + if (vnode.children === "") { + insert(vnode.el = createText(""), parentNode(node), node); + nextNode = node; + } else { + nextNode = onMismatch(); + } + } else { + if (node.data !== vnode.children) { + warn$1( + `Hydration text mismatch in`, + node.parentNode, + ` + - rendered on server: ${JSON.stringify( + node.data + )} + - expected on client: ${JSON.stringify(vnode.children)}` + ); + logMismatchError(); + node.data = vnode.children; + } + nextNode = nextSibling(node); + } + break; + case Comment: + if (isTemplateNode(node)) { + nextNode = nextSibling(node); + replaceNode( + vnode.el = node.content.firstChild, + node, + parentComponent + ); + } else if (domType !== 8 || isFragmentStart) { + nextNode = onMismatch(); + } else { + nextNode = nextSibling(node); + } + break; + case Static: + if (isFragmentStart) { + node = nextSibling(node); + domType = node.nodeType; + } + if (domType === 1 || domType === 3) { + nextNode = node; + const needToAdoptContent = !vnode.children.length; + for (let i = 0; i < vnode.staticCount; i++) { + if (needToAdoptContent) + vnode.children += nextNode.nodeType === 1 ? nextNode.outerHTML : nextNode.data; + if (i === vnode.staticCount - 1) { + vnode.anchor = nextNode; + } + nextNode = nextSibling(nextNode); + } + return isFragmentStart ? nextSibling(nextNode) : nextNode; + } else { + onMismatch(); + } + break; + case Fragment: + if (!isFragmentStart) { + nextNode = onMismatch(); + } else { + nextNode = hydrateFragment( + node, + vnode, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + } + break; + default: + if (shapeFlag & 1) { + if ((domType !== 1 || vnode.type.toLowerCase() !== node.tagName.toLowerCase()) && !isTemplateNode(node)) { + nextNode = onMismatch(); + } else { + nextNode = hydrateElement( + node, + vnode, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + } + } else if (shapeFlag & 6) { + vnode.slotScopeIds = slotScopeIds; + const container = parentNode(node); + if (isFragmentStart) { + nextNode = locateClosingAnchor(node); + } else if (isComment(node) && node.data === "teleport start") { + nextNode = locateClosingAnchor(node, node.data, "teleport end"); + } else { + nextNode = nextSibling(node); + } + mountComponent( + vnode, + container, + null, + parentComponent, + parentSuspense, + getContainerType(container), + optimized + ); + if (isAsyncWrapper(vnode)) { + let subTree; + if (isFragmentStart) { + subTree = createVNode(Fragment); + subTree.anchor = nextNode ? nextNode.previousSibling : container.lastChild; + } else { + subTree = node.nodeType === 3 ? createTextVNode("") : createVNode("div"); + } + subTree.el = node; + vnode.component.subTree = subTree; + } + } else if (shapeFlag & 64) { + if (domType !== 8) { + nextNode = onMismatch(); + } else { + nextNode = vnode.type.hydrate( + node, + vnode, + parentComponent, + parentSuspense, + slotScopeIds, + optimized, + rendererInternals, + hydrateChildren + ); + } + } else if (shapeFlag & 128) { + nextNode = vnode.type.hydrate( + node, + vnode, + parentComponent, + parentSuspense, + getContainerType(parentNode(node)), + slotScopeIds, + optimized, + rendererInternals, + hydrateNode + ); + } else { + warn$1("Invalid HostVNode type:", type, `(${typeof type})`); + } + } + if (ref != null) { + setRef(ref, null, parentSuspense, vnode); + } + return nextNode; + }; + const hydrateElement = (el, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => { + optimized = optimized || !!vnode.dynamicChildren; + const { type, props, patchFlag, shapeFlag, dirs, transition } = vnode; + const forcePatch = type === "input" || type === "option"; + { + if (dirs) { + invokeDirectiveHook(vnode, null, parentComponent, "created"); + } + let needCallTransitionHooks = false; + if (isTemplateNode(el)) { + needCallTransitionHooks = needTransition( + null, + // no need check parentSuspense in hydration + transition + ) && parentComponent && parentComponent.vnode.props && parentComponent.vnode.props.appear; + const content = el.content.firstChild; + if (needCallTransitionHooks) { + transition.beforeEnter(content); + } + replaceNode(content, el, parentComponent); + vnode.el = el = content; + } + if (shapeFlag & 16 && // skip if element has innerHTML / textContent + !(props && (props.innerHTML || props.textContent))) { + let next = hydrateChildren( + el.firstChild, + vnode, + el, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + let hasWarned = false; + while (next) { + if (!isMismatchAllowed(el, 1 /* CHILDREN */)) { + if (!hasWarned) { + warn$1( + `Hydration children mismatch on`, + el, + ` +Server rendered element contains more child nodes than client vdom.` + ); + hasWarned = true; + } + logMismatchError(); + } + const cur = next; + next = next.nextSibling; + remove(cur); + } + } else if (shapeFlag & 8) { + let clientText = vnode.children; + if (clientText[0] === "\n" && (el.tagName === "PRE" || el.tagName === "TEXTAREA")) { + clientText = clientText.slice(1); + } + if (el.textContent !== clientText) { + if (!isMismatchAllowed(el, 0 /* TEXT */)) { + warn$1( + `Hydration text content mismatch on`, + el, + ` + - rendered on server: ${el.textContent} + - expected on client: ${vnode.children}` + ); + logMismatchError(); + } + el.textContent = vnode.children; + } + } + if (props) { + { + const isCustomElement = el.tagName.includes("-"); + for (const key in props) { + if (// #11189 skip if this node has directives that have created hooks + // as it could have mutated the DOM in any possible way + !(dirs && dirs.some((d) => d.dir.created)) && propHasMismatch(el, key, props[key], vnode, parentComponent)) { + logMismatchError(); + } + if (forcePatch && (key.endsWith("value") || key === "indeterminate") || isOn(key) && !isReservedProp(key) || // force hydrate v-bind with .prop modifiers + key[0] === "." || isCustomElement) { + patchProp(el, key, null, props[key], void 0, parentComponent); + } + } + } + } + let vnodeHooks; + if (vnodeHooks = props && props.onVnodeBeforeMount) { + invokeVNodeHook(vnodeHooks, parentComponent, vnode); + } + if (dirs) { + invokeDirectiveHook(vnode, null, parentComponent, "beforeMount"); + } + if ((vnodeHooks = props && props.onVnodeMounted) || dirs || needCallTransitionHooks) { + queueEffectWithSuspense(() => { + vnodeHooks && invokeVNodeHook(vnodeHooks, parentComponent, vnode); + needCallTransitionHooks && transition.enter(el); + dirs && invokeDirectiveHook(vnode, null, parentComponent, "mounted"); + }, parentSuspense); + } + } + return el.nextSibling; + }; + const hydrateChildren = (node, parentVNode, container, parentComponent, parentSuspense, slotScopeIds, optimized) => { + optimized = optimized || !!parentVNode.dynamicChildren; + const children = parentVNode.children; + const l = children.length; + let hasWarned = false; + for (let i = 0; i < l; i++) { + const vnode = optimized ? children[i] : children[i] = normalizeVNode(children[i]); + const isText = vnode.type === Text; + if (node) { + if (isText && !optimized) { + if (i + 1 < l && normalizeVNode(children[i + 1]).type === Text) { + insert( + createText( + node.data.slice(vnode.children.length) + ), + container, + nextSibling(node) + ); + node.data = vnode.children; + } + } + node = hydrateNode( + node, + vnode, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + } else if (isText && !vnode.children) { + insert(vnode.el = createText(""), container); + } else { + if (!isMismatchAllowed(container, 1 /* CHILDREN */)) { + if (!hasWarned) { + warn$1( + `Hydration children mismatch on`, + container, + ` +Server rendered element contains fewer child nodes than client vdom.` + ); + hasWarned = true; + } + logMismatchError(); + } + patch( + null, + vnode, + container, + null, + parentComponent, + parentSuspense, + getContainerType(container), + slotScopeIds + ); + } + } + return node; + }; + const hydrateFragment = (node, vnode, parentComponent, parentSuspense, slotScopeIds, optimized) => { + const { slotScopeIds: fragmentSlotScopeIds } = vnode; + if (fragmentSlotScopeIds) { + slotScopeIds = slotScopeIds ? slotScopeIds.concat(fragmentSlotScopeIds) : fragmentSlotScopeIds; + } + const container = parentNode(node); + const next = hydrateChildren( + nextSibling(node), + vnode, + container, + parentComponent, + parentSuspense, + slotScopeIds, + optimized + ); + if (next && isComment(next) && next.data === "]") { + return nextSibling(vnode.anchor = next); + } else { + logMismatchError(); + insert(vnode.anchor = createComment(`]`), container, next); + return next; + } + }; + const handleMismatch = (node, vnode, parentComponent, parentSuspense, slotScopeIds, isFragment) => { + if (!isMismatchAllowed(node.parentElement, 1 /* CHILDREN */)) { + warn$1( + `Hydration node mismatch: +- rendered on server:`, + node, + node.nodeType === 3 ? `(text)` : isComment(node) && node.data === "[" ? `(start of fragment)` : ``, + ` +- expected on client:`, + vnode.type + ); + logMismatchError(); + } + vnode.el = null; + if (isFragment) { + const end = locateClosingAnchor(node); + while (true) { + const next2 = nextSibling(node); + if (next2 && next2 !== end) { + remove(next2); + } else { + break; + } + } + } + const next = nextSibling(node); + const container = parentNode(node); + remove(node); + patch( + null, + vnode, + container, + next, + parentComponent, + parentSuspense, + getContainerType(container), + slotScopeIds + ); + return next; + }; + const locateClosingAnchor = (node, open = "[", close = "]") => { + let match = 0; + while (node) { + node = nextSibling(node); + if (node && isComment(node)) { + if (node.data === open) match++; + if (node.data === close) { + if (match === 0) { + return nextSibling(node); + } else { + match--; + } + } + } + } + return node; + }; + const replaceNode = (newNode, oldNode, parentComponent) => { + const parentNode2 = oldNode.parentNode; + if (parentNode2) { + parentNode2.replaceChild(newNode, oldNode); + } + let parent = parentComponent; + while (parent) { + if (parent.vnode.el === oldNode) { + parent.vnode.el = parent.subTree.el = newNode; + } + parent = parent.parent; + } + }; + const isTemplateNode = (node) => { + return node.nodeType === 1 && node.tagName === "TEMPLATE"; + }; + return [hydrate, hydrateNode]; +} +function propHasMismatch(el, key, clientValue, vnode, instance) { + let mismatchType; + let mismatchKey; + let actual; + let expected; + if (key === "class") { + actual = el.getAttribute("class"); + expected = normalizeClass(clientValue); + if (!isSetEqual(toClassSet(actual || ""), toClassSet(expected))) { + mismatchType = 2 /* CLASS */; + mismatchKey = `class`; + } + } else if (key === "style") { + actual = el.getAttribute("style") || ""; + expected = isString(clientValue) ? clientValue : stringifyStyle(normalizeStyle(clientValue)); + const actualMap = toStyleMap(actual); + const expectedMap = toStyleMap(expected); + if (vnode.dirs) { + for (const { dir, value } of vnode.dirs) { + if (dir.name === "show" && !value) { + expectedMap.set("display", "none"); + } + } + } + if (instance) { + resolveCssVars(instance, vnode, expectedMap); + } + if (!isMapEqual(actualMap, expectedMap)) { + mismatchType = 3 /* STYLE */; + mismatchKey = "style"; + } + } else if (el instanceof SVGElement && isKnownSvgAttr(key) || el instanceof HTMLElement && (isBooleanAttr(key) || isKnownHtmlAttr(key))) { + if (isBooleanAttr(key)) { + actual = el.hasAttribute(key); + expected = includeBooleanAttr(clientValue); + } else if (clientValue == null) { + actual = el.hasAttribute(key); + expected = false; + } else { + if (el.hasAttribute(key)) { + actual = el.getAttribute(key); + } else if (key === "value" && el.tagName === "TEXTAREA") { + actual = el.value; + } else { + actual = false; + } + expected = isRenderableAttrValue(clientValue) ? String(clientValue) : false; + } + if (actual !== expected) { + mismatchType = 4 /* ATTRIBUTE */; + mismatchKey = key; + } + } + if (mismatchType != null && !isMismatchAllowed(el, mismatchType)) { + const format = (v) => v === false ? `(not rendered)` : `${mismatchKey}="${v}"`; + const preSegment = `Hydration ${MismatchTypeString[mismatchType]} mismatch on`; + const postSegment = ` + - rendered on server: ${format(actual)} + - expected on client: ${format(expected)} + Note: this mismatch is check-only. The DOM will not be rectified in production due to performance overhead. + You should fix the source of the mismatch.`; + { + warn$1(preSegment, el, postSegment); + } + return true; + } + return false; +} +function toClassSet(str) { + return new Set(str.trim().split(/\s+/)); +} +function isSetEqual(a, b) { + if (a.size !== b.size) { + return false; + } + for (const s of a) { + if (!b.has(s)) { + return false; + } + } + return true; +} +function toStyleMap(str) { + const styleMap = /* @__PURE__ */ new Map(); + for (const item of str.split(";")) { + let [key, value] = item.split(":"); + key = key.trim(); + value = value && value.trim(); + if (key && value) { + styleMap.set(key, value); + } + } + return styleMap; +} +function isMapEqual(a, b) { + if (a.size !== b.size) { + return false; + } + for (const [key, value] of a) { + if (value !== b.get(key)) { + return false; + } + } + return true; +} +function resolveCssVars(instance, vnode, expectedMap) { + const root = instance.subTree; + if (instance.getCssVars && (vnode === root || root && root.type === Fragment && root.children.includes(vnode))) { + const cssVars = instance.getCssVars(); + for (const key in cssVars) { + expectedMap.set( + `--${getEscapedCssVarName(key)}`, + String(cssVars[key]) + ); + } + } + if (vnode === root && instance.parent) { + resolveCssVars(instance.parent, instance.vnode, expectedMap); + } +} +const allowMismatchAttr = "data-allow-mismatch"; +const MismatchTypeString = { + [0 /* TEXT */]: "text", + [1 /* CHILDREN */]: "children", + [2 /* CLASS */]: "class", + [3 /* STYLE */]: "style", + [4 /* ATTRIBUTE */]: "attribute" +}; +function isMismatchAllowed(el, allowedType) { + if (allowedType === 0 /* TEXT */ || allowedType === 1 /* CHILDREN */) { + while (el && !el.hasAttribute(allowMismatchAttr)) { + el = el.parentElement; + } + } + const allowedAttr = el && el.getAttribute(allowMismatchAttr); + if (allowedAttr == null) { + return false; + } else if (allowedAttr === "") { + return true; + } else { + const list = allowedAttr.split(","); + if (allowedType === 0 /* TEXT */ && list.includes("children")) { + return true; + } + return allowedAttr.split(",").includes(MismatchTypeString[allowedType]); + } +} + +const requestIdleCallback = getGlobalThis().requestIdleCallback || ((cb) => setTimeout(cb, 1)); +const cancelIdleCallback = getGlobalThis().cancelIdleCallback || ((id) => clearTimeout(id)); +const hydrateOnIdle = (timeout = 1e4) => (hydrate) => { + const id = requestIdleCallback(hydrate, { timeout }); + return () => cancelIdleCallback(id); +}; +function elementIsVisibleInViewport(el) { + const { top, left, bottom, right } = el.getBoundingClientRect(); + const { innerHeight, innerWidth } = window; + return (top > 0 && top < innerHeight || bottom > 0 && bottom < innerHeight) && (left > 0 && left < innerWidth || right > 0 && right < innerWidth); +} +const hydrateOnVisible = (opts) => (hydrate, forEach) => { + const ob = new IntersectionObserver((entries) => { + for (const e of entries) { + if (!e.isIntersecting) continue; + ob.disconnect(); + hydrate(); + break; + } + }, opts); + forEach((el) => { + if (!(el instanceof Element)) return; + if (elementIsVisibleInViewport(el)) { + hydrate(); + ob.disconnect(); + return false; + } + ob.observe(el); + }); + return () => ob.disconnect(); +}; +const hydrateOnMediaQuery = (query) => (hydrate) => { + if (query) { + const mql = matchMedia(query); + if (mql.matches) { + hydrate(); + } else { + mql.addEventListener("change", hydrate, { once: true }); + return () => mql.removeEventListener("change", hydrate); + } + } +}; +const hydrateOnInteraction = (interactions = []) => (hydrate, forEach) => { + if (isString(interactions)) interactions = [interactions]; + let hasHydrated = false; + const doHydrate = (e) => { + if (!hasHydrated) { + hasHydrated = true; + teardown(); + hydrate(); + e.target.dispatchEvent(new e.constructor(e.type, e)); + } + }; + const teardown = () => { + forEach((el) => { + for (const i of interactions) { + el.removeEventListener(i, doHydrate); + } + }); + }; + forEach((el) => { + for (const i of interactions) { + el.addEventListener(i, doHydrate, { once: true }); + } + }); + return teardown; +}; +function forEachElement(node, cb) { + if (isComment(node) && node.data === "[") { + let depth = 1; + let next = node.nextSibling; + while (next) { + if (next.nodeType === 1) { + const result = cb(next); + if (result === false) { + break; + } + } else if (isComment(next)) { + if (next.data === "]") { + if (--depth === 0) break; + } else if (next.data === "[") { + depth++; + } + } + next = next.nextSibling; + } + } else { + cb(node); + } +} + +const isAsyncWrapper = (i) => !!i.type.__asyncLoader; +/*! #__NO_SIDE_EFFECTS__ */ +// @__NO_SIDE_EFFECTS__ +function defineAsyncComponent(source) { + if (isFunction(source)) { + source = { loader: source }; + } + const { + loader, + loadingComponent, + errorComponent, + delay = 200, + hydrate: hydrateStrategy, + timeout, + // undefined = never times out + suspensible = true, + onError: userOnError + } = source; + let pendingRequest = null; + let resolvedComp; + let retries = 0; + const retry = () => { + retries++; + pendingRequest = null; + return load(); + }; + const load = () => { + let thisRequest; + return pendingRequest || (thisRequest = pendingRequest = loader().catch((err) => { + err = err instanceof Error ? err : new Error(String(err)); + if (userOnError) { + return new Promise((resolve, reject) => { + const userRetry = () => resolve(retry()); + const userFail = () => reject(err); + userOnError(err, userRetry, userFail, retries + 1); + }); + } else { + throw err; + } + }).then((comp) => { + if (thisRequest !== pendingRequest && pendingRequest) { + return pendingRequest; + } + if (!comp) { + warn$1( + `Async component loader resolved to undefined. If you are using retry(), make sure to return its return value.` + ); + } + if (comp && (comp.__esModule || comp[Symbol.toStringTag] === "Module")) { + comp = comp.default; + } + if (comp && !isObject(comp) && !isFunction(comp)) { + throw new Error(`Invalid async component load result: ${comp}`); + } + resolvedComp = comp; + return comp; + })); + }; + return defineComponent({ + name: "AsyncComponentWrapper", + __asyncLoader: load, + __asyncHydrate(el, instance, hydrate) { + const doHydrate = hydrateStrategy ? () => { + const teardown = hydrateStrategy( + hydrate, + (cb) => forEachElement(el, cb) + ); + if (teardown) { + (instance.bum || (instance.bum = [])).push(teardown); + } + } : hydrate; + if (resolvedComp) { + doHydrate(); + } else { + load().then(() => !instance.isUnmounted && doHydrate()); + } + }, + get __asyncResolved() { + return resolvedComp; + }, + setup() { + const instance = currentInstance; + markAsyncBoundary(instance); + if (resolvedComp) { + return () => createInnerComp(resolvedComp, instance); + } + const onError = (err) => { + pendingRequest = null; + handleError( + err, + instance, + 13, + !errorComponent + ); + }; + if (suspensible && instance.suspense || isInSSRComponentSetup) { + return load().then((comp) => { + return () => createInnerComp(comp, instance); + }).catch((err) => { + onError(err); + return () => errorComponent ? createVNode(errorComponent, { + error: err + }) : null; + }); + } + const loaded = ref(false); + const error = ref(); + const delayed = ref(!!delay); + if (delay) { + setTimeout(() => { + delayed.value = false; + }, delay); + } + if (timeout != null) { + setTimeout(() => { + if (!loaded.value && !error.value) { + const err = new Error( + `Async component timed out after ${timeout}ms.` + ); + onError(err); + error.value = err; + } + }, timeout); + } + load().then(() => { + loaded.value = true; + if (instance.parent && isKeepAlive(instance.parent.vnode)) { + instance.parent.update(); + } + }).catch((err) => { + onError(err); + error.value = err; + }); + return () => { + if (loaded.value && resolvedComp) { + return createInnerComp(resolvedComp, instance); + } else if (error.value && errorComponent) { + return createVNode(errorComponent, { + error: error.value + }); + } else if (loadingComponent && !delayed.value) { + return createVNode(loadingComponent); + } + }; + } + }); +} +function createInnerComp(comp, parent) { + const { ref: ref2, props, children, ce } = parent.vnode; + const vnode = createVNode(comp, props, children); + vnode.ref = ref2; + vnode.ce = ce; + delete parent.vnode.ce; + return vnode; +} + +const isKeepAlive = (vnode) => vnode.type.__isKeepAlive; +const KeepAliveImpl = { + name: `KeepAlive`, + // Marker for special handling inside the renderer. We are not using a === + // check directly on KeepAlive in the renderer, because importing it directly + // would prevent it from being tree-shaken. + __isKeepAlive: true, + props: { + include: [String, RegExp, Array], + exclude: [String, RegExp, Array], + max: [String, Number] + }, + setup(props, { slots }) { + const instance = getCurrentInstance(); + const sharedContext = instance.ctx; + if (!sharedContext.renderer) { + return () => { + const children = slots.default && slots.default(); + return children && children.length === 1 ? children[0] : children; + }; + } + const cache = /* @__PURE__ */ new Map(); + const keys = /* @__PURE__ */ new Set(); + let current = null; + { + instance.__v_cache = cache; + } + const parentSuspense = instance.suspense; + const { + renderer: { + p: patch, + m: move, + um: _unmount, + o: { createElement } + } + } = sharedContext; + const storageContainer = createElement("div"); + sharedContext.activate = (vnode, container, anchor, namespace, optimized) => { + const instance2 = vnode.component; + move(vnode, container, anchor, 0, parentSuspense); + patch( + instance2.vnode, + vnode, + container, + anchor, + instance2, + parentSuspense, + namespace, + vnode.slotScopeIds, + optimized + ); + queuePostRenderEffect(() => { + instance2.isDeactivated = false; + if (instance2.a) { + invokeArrayFns(instance2.a); + } + const vnodeHook = vnode.props && vnode.props.onVnodeMounted; + if (vnodeHook) { + invokeVNodeHook(vnodeHook, instance2.parent, vnode); + } + }, parentSuspense); + { + devtoolsComponentAdded(instance2); + } + }; + sharedContext.deactivate = (vnode) => { + const instance2 = vnode.component; + invalidateMount(instance2.m); + invalidateMount(instance2.a); + move(vnode, storageContainer, null, 1, parentSuspense); + queuePostRenderEffect(() => { + if (instance2.da) { + invokeArrayFns(instance2.da); + } + const vnodeHook = vnode.props && vnode.props.onVnodeUnmounted; + if (vnodeHook) { + invokeVNodeHook(vnodeHook, instance2.parent, vnode); + } + instance2.isDeactivated = true; + }, parentSuspense); + { + devtoolsComponentAdded(instance2); + } + }; + function unmount(vnode) { + resetShapeFlag(vnode); + _unmount(vnode, instance, parentSuspense, true); + } + function pruneCache(filter) { + cache.forEach((vnode, key) => { + const name = getComponentName(vnode.type); + if (name && !filter(name)) { + pruneCacheEntry(key); + } + }); + } + function pruneCacheEntry(key) { + const cached = cache.get(key); + if (cached && (!current || !isSameVNodeType(cached, current))) { + unmount(cached); + } else if (current) { + resetShapeFlag(current); + } + cache.delete(key); + keys.delete(key); + } + watch( + () => [props.include, props.exclude], + ([include, exclude]) => { + include && pruneCache((name) => matches(include, name)); + exclude && pruneCache((name) => !matches(exclude, name)); + }, + // prune post-render after `current` has been updated + { flush: "post", deep: true } + ); + let pendingCacheKey = null; + const cacheSubtree = () => { + if (pendingCacheKey != null) { + if (isSuspense(instance.subTree.type)) { + queuePostRenderEffect(() => { + cache.set(pendingCacheKey, getInnerChild(instance.subTree)); + }, instance.subTree.suspense); + } else { + cache.set(pendingCacheKey, getInnerChild(instance.subTree)); + } + } + }; + onMounted(cacheSubtree); + onUpdated(cacheSubtree); + onBeforeUnmount(() => { + cache.forEach((cached) => { + const { subTree, suspense } = instance; + const vnode = getInnerChild(subTree); + if (cached.type === vnode.type && cached.key === vnode.key) { + resetShapeFlag(vnode); + const da = vnode.component.da; + da && queuePostRenderEffect(da, suspense); + return; + } + unmount(cached); + }); + }); + return () => { + pendingCacheKey = null; + if (!slots.default) { + return current = null; + } + const children = slots.default(); + const rawVNode = children[0]; + if (children.length > 1) { + { + warn$1(`KeepAlive should contain exactly one component child.`); + } + current = null; + return children; + } else if (!isVNode(rawVNode) || !(rawVNode.shapeFlag & 4) && !(rawVNode.shapeFlag & 128)) { + current = null; + return rawVNode; + } + let vnode = getInnerChild(rawVNode); + if (vnode.type === Comment) { + current = null; + return vnode; + } + const comp = vnode.type; + const name = getComponentName( + isAsyncWrapper(vnode) ? vnode.type.__asyncResolved || {} : comp + ); + const { include, exclude, max } = props; + if (include && (!name || !matches(include, name)) || exclude && name && matches(exclude, name)) { + vnode.shapeFlag &= ~256; + current = vnode; + return rawVNode; + } + const key = vnode.key == null ? comp : vnode.key; + const cachedVNode = cache.get(key); + if (vnode.el) { + vnode = cloneVNode(vnode); + if (rawVNode.shapeFlag & 128) { + rawVNode.ssContent = vnode; + } + } + pendingCacheKey = key; + if (cachedVNode) { + vnode.el = cachedVNode.el; + vnode.component = cachedVNode.component; + if (vnode.transition) { + setTransitionHooks(vnode, vnode.transition); + } + vnode.shapeFlag |= 512; + keys.delete(key); + keys.add(key); + } else { + keys.add(key); + if (max && keys.size > parseInt(max, 10)) { + pruneCacheEntry(keys.values().next().value); + } + } + vnode.shapeFlag |= 256; + current = vnode; + return isSuspense(rawVNode.type) ? rawVNode : vnode; + }; + } +}; +const KeepAlive = KeepAliveImpl; +function matches(pattern, name) { + if (isArray(pattern)) { + return pattern.some((p) => matches(p, name)); + } else if (isString(pattern)) { + return pattern.split(",").includes(name); + } else if (isRegExp(pattern)) { + pattern.lastIndex = 0; + return pattern.test(name); + } + return false; +} +function onActivated(hook, target) { + registerKeepAliveHook(hook, "a", target); +} +function onDeactivated(hook, target) { + registerKeepAliveHook(hook, "da", target); +} +function registerKeepAliveHook(hook, type, target = currentInstance) { + const wrappedHook = hook.__wdc || (hook.__wdc = () => { + let current = target; + while (current) { + if (current.isDeactivated) { + return; + } + current = current.parent; + } + return hook(); + }); + injectHook(type, wrappedHook, target); + if (target) { + let current = target.parent; + while (current && current.parent) { + if (isKeepAlive(current.parent.vnode)) { + injectToKeepAliveRoot(wrappedHook, type, target, current); + } + current = current.parent; + } + } +} +function injectToKeepAliveRoot(hook, type, target, keepAliveRoot) { + const injected = injectHook( + type, + hook, + keepAliveRoot, + true + /* prepend */ + ); + onUnmounted(() => { + remove(keepAliveRoot[type], injected); + }, target); +} +function resetShapeFlag(vnode) { + vnode.shapeFlag &= ~256; + vnode.shapeFlag &= ~512; +} +function getInnerChild(vnode) { + return vnode.shapeFlag & 128 ? vnode.ssContent : vnode; +} + +function injectHook(type, hook, target = currentInstance, prepend = false) { + if (target) { + const hooks = target[type] || (target[type] = []); + const wrappedHook = hook.__weh || (hook.__weh = (...args) => { + pauseTracking(); + const reset = setCurrentInstance(target); + const res = callWithAsyncErrorHandling(hook, target, type, args); + reset(); + resetTracking(); + return res; + }); + if (prepend) { + hooks.unshift(wrappedHook); + } else { + hooks.push(wrappedHook); + } + return wrappedHook; + } else { + const apiName = toHandlerKey(ErrorTypeStrings$1[type].replace(/ hook$/, "")); + warn$1( + `${apiName} is called when there is no active component instance to be associated with. Lifecycle injection APIs can only be used during execution of setup().` + (` If you are using async setup(), make sure to register lifecycle hooks before the first await statement.` ) + ); + } +} +const createHook = (lifecycle) => (hook, target = currentInstance) => { + if (!isInSSRComponentSetup || lifecycle === "sp") { + injectHook(lifecycle, (...args) => hook(...args), target); + } +}; +const onBeforeMount = createHook("bm"); +const onMounted = createHook("m"); +const onBeforeUpdate = createHook( + "bu" +); +const onUpdated = createHook("u"); +const onBeforeUnmount = createHook( + "bum" +); +const onUnmounted = createHook("um"); +const onServerPrefetch = createHook( + "sp" +); +const onRenderTriggered = createHook("rtg"); +const onRenderTracked = createHook("rtc"); +function onErrorCaptured(hook, target = currentInstance) { + injectHook("ec", hook, target); +} + +const COMPONENTS = "components"; +const DIRECTIVES = "directives"; +function resolveComponent(name, maybeSelfReference) { + return resolveAsset(COMPONENTS, name, true, maybeSelfReference) || name; +} +const NULL_DYNAMIC_COMPONENT = Symbol.for("v-ndc"); +function resolveDynamicComponent(component) { + if (isString(component)) { + return resolveAsset(COMPONENTS, component, false) || component; + } else { + return component || NULL_DYNAMIC_COMPONENT; + } +} +function resolveDirective(name) { + return resolveAsset(DIRECTIVES, name); +} +function resolveAsset(type, name, warnMissing = true, maybeSelfReference = false) { + const instance = currentRenderingInstance || currentInstance; + if (instance) { + const Component = instance.type; + if (type === COMPONENTS) { + const selfName = getComponentName( + Component, + false + ); + if (selfName && (selfName === name || selfName === camelize(name) || selfName === capitalize(camelize(name)))) { + return Component; + } + } + const res = ( + // local registration + // check instance[type] first which is resolved for options API + resolve(instance[type] || Component[type], name) || // global registration + resolve(instance.appContext[type], name) + ); + if (!res && maybeSelfReference) { + return Component; + } + if (warnMissing && !res) { + const extra = type === COMPONENTS ? ` +If this is a native custom element, make sure to exclude it from component resolution via compilerOptions.isCustomElement.` : ``; + warn$1(`Failed to resolve ${type.slice(0, -1)}: ${name}${extra}`); + } + return res; + } else { + warn$1( + `resolve${capitalize(type.slice(0, -1))} can only be used in render() or setup().` + ); + } +} +function resolve(registry, name) { + return registry && (registry[name] || registry[camelize(name)] || registry[capitalize(camelize(name))]); +} + +function renderList(source, renderItem, cache, index) { + let ret; + const cached = cache && cache[index]; + const sourceIsArray = isArray(source); + if (sourceIsArray || isString(source)) { + const sourceIsReactiveArray = sourceIsArray && isReactive(source); + let needsWrap = false; + if (sourceIsReactiveArray) { + needsWrap = !isShallow(source); + source = shallowReadArray(source); + } + ret = new Array(source.length); + for (let i = 0, l = source.length; i < l; i++) { + ret[i] = renderItem( + needsWrap ? toReactive(source[i]) : source[i], + i, + void 0, + cached && cached[i] + ); + } + } else if (typeof source === "number") { + if (!Number.isInteger(source)) { + warn$1(`The v-for range expect an integer value but got ${source}.`); + } + ret = new Array(source); + for (let i = 0; i < source; i++) { + ret[i] = renderItem(i + 1, i, void 0, cached && cached[i]); + } + } else if (isObject(source)) { + if (source[Symbol.iterator]) { + ret = Array.from( + source, + (item, i) => renderItem(item, i, void 0, cached && cached[i]) + ); + } else { + const keys = Object.keys(source); + ret = new Array(keys.length); + for (let i = 0, l = keys.length; i < l; i++) { + const key = keys[i]; + ret[i] = renderItem(source[key], key, i, cached && cached[i]); + } + } + } else { + ret = []; + } + if (cache) { + cache[index] = ret; + } + return ret; +} + +function createSlots(slots, dynamicSlots) { + for (let i = 0; i < dynamicSlots.length; i++) { + const slot = dynamicSlots[i]; + if (isArray(slot)) { + for (let j = 0; j < slot.length; j++) { + slots[slot[j].name] = slot[j].fn; + } + } else if (slot) { + slots[slot.name] = slot.key ? (...args) => { + const res = slot.fn(...args); + if (res) res.key = slot.key; + return res; + } : slot.fn; + } + } + return slots; +} + +function renderSlot(slots, name, props = {}, fallback, noSlotted) { + if (currentRenderingInstance.ce || currentRenderingInstance.parent && isAsyncWrapper(currentRenderingInstance.parent) && currentRenderingInstance.parent.ce) { + if (name !== "default") props.name = name; + return openBlock(), createBlock( + Fragment, + null, + [createVNode("slot", props, fallback && fallback())], + 64 + ); + } + let slot = slots[name]; + if (slot && slot.length > 1) { + warn$1( + `SSR-optimized slot function detected in a non-SSR-optimized render function. You need to mark this component with $dynamic-slots in the parent template.` + ); + slot = () => []; + } + if (slot && slot._c) { + slot._d = false; + } + openBlock(); + const validSlotContent = slot && ensureValidVNode(slot(props)); + const slotKey = props.key || // slot content array of a dynamic conditional slot may have a branch + // key attached in the `createSlots` helper, respect that + validSlotContent && validSlotContent.key; + const rendered = createBlock( + Fragment, + { + key: (slotKey && !isSymbol(slotKey) ? slotKey : `_${name}`) + // #7256 force differentiate fallback content from actual content + (!validSlotContent && fallback ? "_fb" : "") + }, + validSlotContent || (fallback ? fallback() : []), + validSlotContent && slots._ === 1 ? 64 : -2 + ); + if (!noSlotted && rendered.scopeId) { + rendered.slotScopeIds = [rendered.scopeId + "-s"]; + } + if (slot && slot._c) { + slot._d = true; + } + return rendered; +} +function ensureValidVNode(vnodes) { + return vnodes.some((child) => { + if (!isVNode(child)) return true; + if (child.type === Comment) return false; + if (child.type === Fragment && !ensureValidVNode(child.children)) + return false; + return true; + }) ? vnodes : null; +} + +function toHandlers(obj, preserveCaseIfNecessary) { + const ret = {}; + if (!isObject(obj)) { + warn$1(`v-on with no argument expects an object value.`); + return ret; + } + for (const key in obj) { + ret[preserveCaseIfNecessary && /[A-Z]/.test(key) ? `on:${key}` : toHandlerKey(key)] = obj[key]; + } + return ret; +} + +const getPublicInstance = (i) => { + if (!i) return null; + if (isStatefulComponent(i)) return getComponentPublicInstance(i); + return getPublicInstance(i.parent); +}; +const publicPropertiesMap = ( + // Move PURE marker to new line to workaround compiler discarding it + // due to type annotation + /* @__PURE__ */ extend(/* @__PURE__ */ Object.create(null), { + $: (i) => i, + $el: (i) => i.vnode.el, + $data: (i) => i.data, + $props: (i) => shallowReadonly(i.props) , + $attrs: (i) => shallowReadonly(i.attrs) , + $slots: (i) => shallowReadonly(i.slots) , + $refs: (i) => shallowReadonly(i.refs) , + $parent: (i) => getPublicInstance(i.parent), + $root: (i) => getPublicInstance(i.root), + $host: (i) => i.ce, + $emit: (i) => i.emit, + $options: (i) => resolveMergedOptions(i) , + $forceUpdate: (i) => i.f || (i.f = () => { + queueJob(i.update); + }), + $nextTick: (i) => i.n || (i.n = nextTick.bind(i.proxy)), + $watch: (i) => instanceWatch.bind(i) + }) +); +const isReservedPrefix = (key) => key === "_" || key === "$"; +const hasSetupBinding = (state, key) => state !== EMPTY_OBJ && !state.__isScriptSetup && hasOwn(state, key); +const PublicInstanceProxyHandlers = { + get({ _: instance }, key) { + if (key === "__v_skip") { + return true; + } + const { ctx, setupState, data, props, accessCache, type, appContext } = instance; + if (key === "__isVue") { + return true; + } + let normalizedProps; + if (key[0] !== "$") { + const n = accessCache[key]; + if (n !== void 0) { + switch (n) { + case 1 /* SETUP */: + return setupState[key]; + case 2 /* DATA */: + return data[key]; + case 4 /* CONTEXT */: + return ctx[key]; + case 3 /* PROPS */: + return props[key]; + } + } else if (hasSetupBinding(setupState, key)) { + accessCache[key] = 1 /* SETUP */; + return setupState[key]; + } else if (data !== EMPTY_OBJ && hasOwn(data, key)) { + accessCache[key] = 2 /* DATA */; + return data[key]; + } else if ( + // only cache other properties when instance has declared (thus stable) + // props + (normalizedProps = instance.propsOptions[0]) && hasOwn(normalizedProps, key) + ) { + accessCache[key] = 3 /* PROPS */; + return props[key]; + } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) { + accessCache[key] = 4 /* CONTEXT */; + return ctx[key]; + } else if (shouldCacheAccess) { + accessCache[key] = 0 /* OTHER */; + } + } + const publicGetter = publicPropertiesMap[key]; + let cssModule, globalProperties; + if (publicGetter) { + if (key === "$attrs") { + track(instance.attrs, "get", ""); + markAttrsAccessed(); + } else if (key === "$slots") { + track(instance, "get", key); + } + return publicGetter(instance); + } else if ( + // css module (injected by vue-loader) + (cssModule = type.__cssModules) && (cssModule = cssModule[key]) + ) { + return cssModule; + } else if (ctx !== EMPTY_OBJ && hasOwn(ctx, key)) { + accessCache[key] = 4 /* CONTEXT */; + return ctx[key]; + } else if ( + // global properties + globalProperties = appContext.config.globalProperties, hasOwn(globalProperties, key) + ) { + { + return globalProperties[key]; + } + } else if (currentRenderingInstance && (!isString(key) || // #1091 avoid internal isRef/isVNode checks on component instance leading + // to infinite warning loop + key.indexOf("__v") !== 0)) { + if (data !== EMPTY_OBJ && isReservedPrefix(key[0]) && hasOwn(data, key)) { + warn$1( + `Property ${JSON.stringify( + key + )} must be accessed via $data because it starts with a reserved character ("$" or "_") and is not proxied on the render context.` + ); + } else if (instance === currentRenderingInstance) { + warn$1( + `Property ${JSON.stringify(key)} was accessed during render but is not defined on instance.` + ); + } + } + }, + set({ _: instance }, key, value) { + const { data, setupState, ctx } = instance; + if (hasSetupBinding(setupState, key)) { + setupState[key] = value; + return true; + } else if (setupState.__isScriptSetup && hasOwn(setupState, key)) { + warn$1(`Cannot mutate + - - -

- + -
+ + + diff --git a/examples/server/public/colorthemes.css b/examples/server/public_legacy/colorthemes.css similarity index 100% rename from examples/server/public/colorthemes.css rename to examples/server/public_legacy/colorthemes.css diff --git a/examples/server/public_legacy/completion.js b/examples/server/public_legacy/completion.js new file mode 100644 index 000000000..30df7c2fa --- /dev/null +++ b/examples/server/public_legacy/completion.js @@ -0,0 +1,209 @@ +const paramDefaults = { + stream: true, + n_predict: 500, + temperature: 0.2, + stop: [""] +}; + +let generation_settings = null; + + +// Completes the prompt as a generator. Recommended for most use cases. +// +// Example: +// +// import { llama } from '/completion.js' +// +// const request = llama("Tell me a joke", {n_predict: 800}) +// for await (const chunk of request) { +// document.write(chunk.data.content) +// } +// +export async function* llama(prompt, params = {}, config = {}) { + let controller = config.controller; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; + + if (!controller) { + controller = new AbortController(); + } + + const completionParams = { ...paramDefaults, ...params, prompt }; + + const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, { + method: 'POST', + body: JSON.stringify(completionParams), + headers: { + 'Connection': 'keep-alive', + 'Content-Type': 'application/json', + 'Accept': 'text/event-stream', + ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {}) + }, + signal: controller.signal, + }); + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + + let content = ""; + let leftover = ""; // Buffer for partially read lines + + try { + let cont = true; + + while (cont) { + const result = await reader.read(); + if (result.done) { + break; + } + + // Add any leftover data to the current chunk of data + const text = leftover + decoder.decode(result.value); + + // Check if the last character is a line break + const endsWithLineBreak = text.endsWith('\n'); + + // Split the text into lines + let lines = text.split('\n'); + + // If the text doesn't end with a line break, then the last line is incomplete + // Store it in leftover to be added to the next chunk of data + if (!endsWithLineBreak) { + leftover = lines.pop(); + } else { + leftover = ""; // Reset leftover if we have a line break at the end + } + + // Parse all sse events and add them to result + const regex = /^(\S+):\s(.*)$/gm; + for (const line of lines) { + const match = regex.exec(line); + if (match) { + result[match[1]] = match[2]; + if (result.data === '[DONE]') { + cont = false; + break; + } + + // since we know this is llama.cpp, let's just decode the json in data + if (result.data) { + result.data = JSON.parse(result.data); + content += result.data.content; + + // yield + yield result; + + // if we got a stop token from server, we will break here + if (result.data.stop) { + if (result.data.generation_settings) { + generation_settings = result.data.generation_settings; + } + cont = false; + break; + } + } + if (result.error) { + try { + result.error = JSON.parse(result.error); + if (result.error.message.includes('slot unavailable')) { + // Throw an error to be caught by upstream callers + throw new Error('slot unavailable'); + } else { + console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`); + } + } catch(e) { + console.error(`llama.cpp error ${result.error}`) + } + } + } + } + } + } catch (e) { + if (e.name !== 'AbortError') { + console.error("llama error: ", e); + } + throw e; + } + finally { + controller.abort(); + } + + return content; +} + +// Call llama, return an event target that you can subscribe to +// +// Example: +// +// import { llamaEventTarget } from '/completion.js' +// +// const conn = llamaEventTarget(prompt) +// conn.addEventListener("message", (chunk) => { +// document.write(chunk.detail.content) +// }) +// +export const llamaEventTarget = (prompt, params = {}, config = {}) => { + const eventTarget = new EventTarget(); + (async () => { + let content = ""; + for await (const chunk of llama(prompt, params, config)) { + if (chunk.data) { + content += chunk.data.content; + eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data })); + } + if (chunk.data.generation_settings) { + eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings })); + } + if (chunk.data.timings) { + eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings })); + } + } + eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } })); + })(); + return eventTarget; +} + +// Call llama, return a promise that resolves to the completed text. This does not support streaming +// +// Example: +// +// llamaPromise(prompt).then((content) => { +// document.write(content) +// }) +// +// or +// +// const content = await llamaPromise(prompt) +// document.write(content) +// +export const llamaPromise = (prompt, params = {}, config = {}) => { + return new Promise(async (resolve, reject) => { + let content = ""; + try { + for await (const chunk of llama(prompt, params, config)) { + content += chunk.data.content; + } + resolve(content); + } catch (error) { + reject(error); + } + }); +}; + +/** + * (deprecated) + */ +export const llamaComplete = async (params, controller, callback) => { + for await (const chunk of llama(params.prompt, params, { controller })) { + callback(chunk); + } +} + +// Get the model info from the server. This is useful for getting the context window and so on. +export const llamaModelInfo = async (config = {}) => { + if (!generation_settings) { + const api_url = config.api_url?.replace(/\/+$/, '') || ""; + const props = await fetch(`${api_url}/props`).then(r => r.json()); + generation_settings = props.default_generation_settings; + } + return generation_settings; +} diff --git a/examples/server/public/favicon.ico b/examples/server/public_legacy/favicon.ico similarity index 100% rename from examples/server/public/favicon.ico rename to examples/server/public_legacy/favicon.ico diff --git a/examples/server/public/index-new.html b/examples/server/public_legacy/index-new.html similarity index 100% rename from examples/server/public/index-new.html rename to examples/server/public_legacy/index-new.html diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html new file mode 100644 index 000000000..a95f5c6df --- /dev/null +++ b/examples/server/public_legacy/index.html @@ -0,0 +1,1303 @@ + + + + + + llama.cpp - chat + + + + + + + +
+ +
+
+ + + diff --git a/examples/server/public/index.js b/examples/server/public_legacy/index.js similarity index 100% rename from examples/server/public/index.js rename to examples/server/public_legacy/index.js diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs similarity index 100% rename from examples/server/public/json-schema-to-grammar.mjs rename to examples/server/public_legacy/json-schema-to-grammar.mjs diff --git a/examples/server/public_legacy/loading.html b/examples/server/public_legacy/loading.html new file mode 100644 index 000000000..c3fd19a0f --- /dev/null +++ b/examples/server/public_legacy/loading.html @@ -0,0 +1,12 @@ + + + + + + +
+ The model is loading. Please wait.
+ The user interface will appear soon. +
+ + diff --git a/examples/server/public/prompt-formats.js b/examples/server/public_legacy/prompt-formats.js similarity index 100% rename from examples/server/public/prompt-formats.js rename to examples/server/public_legacy/prompt-formats.js diff --git a/examples/server/public/style.css b/examples/server/public_legacy/style.css similarity index 100% rename from examples/server/public/style.css rename to examples/server/public_legacy/style.css diff --git a/examples/server/public/system-prompts.js b/examples/server/public_legacy/system-prompts.js similarity index 100% rename from examples/server/public/system-prompts.js rename to examples/server/public_legacy/system-prompts.js diff --git a/examples/server/public/theme-beeninorder.css b/examples/server/public_legacy/theme-beeninorder.css similarity index 100% rename from examples/server/public/theme-beeninorder.css rename to examples/server/public_legacy/theme-beeninorder.css diff --git a/examples/server/public/theme-ketivah.css b/examples/server/public_legacy/theme-ketivah.css similarity index 100% rename from examples/server/public/theme-ketivah.css rename to examples/server/public_legacy/theme-ketivah.css diff --git a/examples/server/public/theme-mangotango.css b/examples/server/public_legacy/theme-mangotango.css similarity index 100% rename from examples/server/public/theme-mangotango.css rename to examples/server/public_legacy/theme-mangotango.css diff --git a/examples/server/public/theme-playground.css b/examples/server/public_legacy/theme-playground.css similarity index 100% rename from examples/server/public/theme-playground.css rename to examples/server/public_legacy/theme-playground.css diff --git a/examples/server/public/theme-polarnight.css b/examples/server/public_legacy/theme-polarnight.css similarity index 100% rename from examples/server/public/theme-polarnight.css rename to examples/server/public_legacy/theme-polarnight.css diff --git a/examples/server/public/theme-snowstorm.css b/examples/server/public_legacy/theme-snowstorm.css similarity index 100% rename from examples/server/public/theme-snowstorm.css rename to examples/server/public_legacy/theme-snowstorm.css diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1c7f0fd1d..a6d3a1c95 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -14,22 +14,13 @@ #define MIMETYPE_JSON "application/json; charset=utf-8" // auto generated files (update with ./deps.sh) -#include "colorthemes.css.hpp" -#include "style.css.hpp" -#include "theme-beeninorder.css.hpp" -#include "theme-ketivah.css.hpp" -#include "theme-mangotango.css.hpp" -#include "theme-playground.css.hpp" -#include "theme-polarnight.css.hpp" -#include "theme-snowstorm.css.hpp" #include "index.html.hpp" -#include "index-new.html.hpp" -#include "index.js.hpp" #include "completion.js.hpp" -#include "system-prompts.js.hpp" -#include "prompt-formats.js.hpp" -#include "json-schema-to-grammar.mjs.hpp" #include "loading.html.hpp" +#include "deps_daisyui.min.css.hpp" +#include "deps_markdown-it.js.hpp" +#include "deps_tailwindcss.js.hpp" +#include "deps_vue.esm-browser.js.hpp" #include #include @@ -2285,16 +2276,6 @@ int main(int argc, char ** argv) { std::atomic state{SERVER_STATE_LOADING_MODEL}; svr->set_default_headers({{"Server", "llama.cpp"}}); - - // CORS preflight - svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) { - // Access-Control-Allow-Origin is already set by middleware - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - return res.set_content("", "text/html"); // blank response, no data - }); - svr->set_logger(log_server_request); auto res_error = [](httplib::Response & res, const json & error_data) { @@ -2407,6 +2388,14 @@ int main(int argc, char ** argv) { // register server middlewares svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + // If this is OPTIONS request, skip validation because browsers don't include Authorization header + if (req.method == "OPTIONS") { + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "GET, POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + res.set_content("", "text/html"); // blank response, no data + return httplib::Server::HandlerResponse::Handled; // skip further processing + } if (!middleware_server_state(req, res)) { return httplib::Server::HandlerResponse::Handled; } @@ -3116,33 +3105,19 @@ int main(int argc, char ** argv) { // register static assets routes if (!params.public_path.empty()) { // Set the base directory for serving static files - svr->set_base_dir(params.public_path); - } - - if (!params.api_keys.empty()) { - // for now, if API key is set, web UI is unusable - svr->Get("/", [&](const httplib::Request &, httplib::Response & res) { - return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8"); - }); + bool is_found = svr->set_mount_point("/", params.public_path); + if (!is_found) { + LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + return 1; + } } else { // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - - // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8")); + svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8")); + svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8")); + svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8")); } // register API routes diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index 0a3c5cc77..ef30007c3 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -64,5 +64,5 @@ Feature: Security | localhost | Access-Control-Allow-Origin | localhost | | web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr | | origin | Access-Control-Allow-Credentials | true | - | web.mydomain.fr | Access-Control-Allow-Methods | POST | + | web.mydomain.fr | Access-Control-Allow-Methods | GET, POST | | web.mydomain.fr | Access-Control-Allow-Headers | * | diff --git a/grammars/README.md b/grammars/README.md index 4e8b4e2fc..4e57bca5f 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -124,7 +124,7 @@ You can use GBNF grammars: - In [llama-cli](../examples/main), passed as the `--json` / `-j` flag - To convert to a grammar ahead of time: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) + - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). diff --git a/tests/run-json-schema-to-grammar.mjs b/tests/run-json-schema-to-grammar.mjs index 71bf62ed3..b20ac1d6b 100644 --- a/tests/run-json-schema-to-grammar.mjs +++ b/tests/run-json-schema-to-grammar.mjs @@ -1,5 +1,5 @@ import { readFileSync } from "fs" -import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs" +import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs" const [, , file] = process.argv const url = `file://${file}` From 76c6e7f10551960e4ec9e14e0535b72081f1c7ad Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 7 Nov 2024 18:44:38 -0400 Subject: [PATCH 054/279] server : minor UI fix (#10207) --- examples/server/public/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 850c652ac..bf1d1b794 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -167,7 +167,7 @@
-
+
From d05b3127bd30515955aa4ee2bacdb68ebafe88f4 Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Fri, 8 Nov 2024 17:34:06 +0800 Subject: [PATCH 055/279] swift : exclude ggml-metal-embed.metal (#10211) * llama.swift : exclude ggml-metal-embed.metal * swift : exclude build/ --- Package.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Package.swift b/Package.swift index d3661d13c..0f4f19018 100644 --- a/Package.swift +++ b/Package.swift @@ -61,13 +61,15 @@ let package = Package( name: "llama", path: ".", exclude: [ + "build", "cmake", "examples", "scripts", "models", "tests", "CMakeLists.txt", - "Makefile" + "Makefile", + "ggml/src/ggml-metal-embed.metal" ], sources: sources, resources: resources, From 841f27abdbbcecc9daac14dc540ba6202e4ffe40 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Nov 2024 13:47:22 +0200 Subject: [PATCH 056/279] metal : optimize FA kernels (#10171) * ggml : add ggml_flash_attn_ext_get_prec * metal : use F16 precision in FA kernels ggml-ci * metal : minor clean-up * metal : compile-guard bf16 FA kernels ggml-ci * build : remove obsolete compile flag [no ci] * metal : prevent int overflows [no ci] * cuda : disable BF16 FA ggml-ci * metal : fix BF16 requirement for FA kernels ggml-ci * make : clean-up [no ci] --- examples/llama-bench/llama-bench.cpp | 3 + ggml/include/ggml.h | 3 + ggml/src/ggml-cuda.cu | 3 + ggml/src/ggml-cuda/fattn.cu | 10 +- ggml/src/ggml-metal.m | 74 ++- ggml/src/ggml-metal.metal | 733 +++++++++++++++------------ ggml/src/ggml.c | 9 + tests/test-backend-ops.cpp | 2 +- 8 files changed, 498 insertions(+), 339 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index e7873a143..1eddfd0db 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) { if (s == "f16") { return GGML_TYPE_F16; } + if (s == "bf16") { + return GGML_TYPE_BF16; + } if (s == "q8_0") { return GGML_TYPE_Q8_0; } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0d143d2fe..73ede1813 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1746,6 +1746,9 @@ extern "C" { struct ggml_tensor * a, enum ggml_prec prec); + GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec( + const struct ggml_tensor * a); + // TODO: needs to be adapted to ggml_flash_attn_ext GGML_API struct ggml_tensor * ggml_flash_attn_back( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e27c8e87d..357cee660 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g #ifndef FLASH_ATTN_AVAILABLE return false; #endif + if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) { + return false; + } if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) { return true; } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 83e5589a1..0e7ebbc53 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; - const int32_t precision = KQV->op_params[3]; + const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); - if (precision != GGML_PREC_DEFAULT) { + if (prec != GGML_PREC_DEFAULT) { if (Q->ne[1] <= 32 || Q->ne[0] > 128) { constexpr int cols_per_block = 16; switch (Q->ne[0]) { @@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst ggml_cuda_set_device(ctx.device); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - const int32_t precision = KQV->op_params[3]; + const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); // On AMD the tile kernels perform poorly, use the vec kernel instead: if (cc >= CC_OFFSET_AMD) { - if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { + if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); @@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst } if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { - if (precision == GGML_PREC_DEFAULT) { + if (prec == GGML_PREC_DEFAULT) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); return; } else if(Q->ne[0] <= 128) { diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f13adee38..e19397fd2 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -269,6 +269,12 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, @@ -300,12 +306,14 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, @@ -585,6 +593,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ id metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \ kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \ + GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ + (int) kernel->pipeline.threadExecutionWidth); \ [metal_function release]; \ if (error) { \ GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ @@ -777,6 +788,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, flash_attn_ext_bf16_h112, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, flash_attn_ext_bf16_h128, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm); @@ -808,12 +825,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, flash_attn_ext_vec_bf16_h256, has_simdgroup_reduction && has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); @@ -1111,7 +1130,7 @@ static void ggml_metal_encode_node( const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); const uint64_t nb21 = src2 ? src2->nb[1] : 0; const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); const int64_t ne0 = dst ? dst->ne[0] : 0; const int64_t ne1 = dst ? dst->ne[1] : 0; @@ -3033,6 +3052,23 @@ static void ggml_metal_encode_node( } } } break; + case GGML_TYPE_BF16: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; case GGML_TYPE_Q4_0: { switch (ne00) { @@ -3133,6 +3169,7 @@ static void ggml_metal_encode_node( { switch (src1->type) { case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break; @@ -3150,6 +3187,7 @@ static void ggml_metal_encode_node( { switch (src1->type) { case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break; @@ -3194,18 +3232,15 @@ static void ggml_metal_encode_node( [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14]; [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15]; [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19]; - [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22]; - [encoder setBytes:&scale length:sizeof( float) atIndex:23]; - [encoder setBytes:&max_bias length:sizeof( float) atIndex:24]; - [encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; - [encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; - [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; - [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28]; + [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:18]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:19]; + [encoder setBytes:&scale length:sizeof( float) atIndex:20]; + [encoder setBytes:&max_bias length:sizeof( float) atIndex:21]; + [encoder setBytes:&m0 length:sizeof(m0) atIndex:22]; + [encoder setBytes:&m1 length:sizeof(m1) atIndex:23]; + [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:24]; + [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25]; if (!use_vec_kernel) { // half8x8 kernel @@ -3216,11 +3251,14 @@ static void ggml_metal_encode_node( GGML_ASSERT(nqptg % 8 == 0); GGML_ASSERT(ncpsg % 32 == 0); + // 2*(2*ncpsg + nqptg)*(nsg) + // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float) + // // 16*32*(nsg) // the shared memory needed for the simdgroups to load the KV cache // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG // -#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16)) +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16)) int64_t nsgmax = 2; @@ -3254,12 +3292,12 @@ static void ggml_metal_encode_node( // ne00 + 2*ncpsg*(nsg) // for each query, we load it as f16 in shared memory (ne00) - // and store the attention scores (nqptg x ncpsg) as f32 + // and store the soft_max values and the mask // - // 2*ne00*(nsg) - // each simdgroup has a full f32 head vector in shared mem to accumulate results + // ne00*(nsg) + // each simdgroup has a full f16 head vector in shared mem to accumulate results // -#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16)) +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + ne00*(nsg))*(sizeof(float)/2), 16)) int64_t nsgmax = 2; diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 16b5da3ff..edce74108 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -57,10 +57,14 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask1 = mask0 << 8; - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md; + reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md; } + + reg = (type4x4) reg_f; } template @@ -72,10 +76,14 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask1 = mask0 << 8; - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + float4x4 reg_f; + + for (int i = 0; i < 8; i++) { + reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m; + reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m; } + + reg = (type4x4) reg_f; } template @@ -92,6 +100,8 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg const int gh_mv = il ? 12 : 0; const int gh_bk = il ? 0 : 4; + float4x4 reg_f; + for (int i = 0; i < 8; i++) { // extract the 5-th bits for x0 and x1 const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; @@ -101,9 +111,11 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - reg[i/2][2*(i%2)+0] = d * x0 + md; - reg[i/2][2*(i%2)+1] = d * x1 + md; + reg_f[i/2][2*(i%2) + 0] = d * x0 + md; + reg_f[i/2][2*(i%2) + 1] = d * x1 + md; } + + reg = (type4x4) reg_f; } template @@ -120,6 +132,8 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg const int gh_mv = il ? 12 : 0; const int gh_bk = il ? 0 : 4; + float4x4 reg_f; + for (int i = 0; i < 8; i++) { // extract the 5-th bits for x0 and x1 const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; @@ -129,9 +143,11 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - reg[i/2][2*(i%2)+0] = d * x0 + m; - reg[i/2][2*(i%2)+1] = d * x1 + m; + reg_f[i/2][2*(i%2) + 0] = d * x0 + m; + reg_f[i/2][2*(i%2) + 1] = d * x1 + m; } + + reg = (type4x4) reg_f; } template @@ -139,9 +155,13 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg device const int8_t * qs = ((device const int8_t *)xb->qs); const half d = xb->d; + float4x4 reg_f; + for (int i = 0; i < 16; i++) { - reg[i/4][i%4] = (qs[i + 16*il] * d); + reg_f[i/4][i%4] = (qs[i + 16*il] * d); } + + reg = (type4x4) reg_f; } template @@ -2755,44 +2775,65 @@ kernel void kernel_leaky_relu_f32( } // ref: https://arxiv.org/pdf/2307.08691.pdf -// D - head size, Q - queries per threadgroup, KV - key/value processed per each simdgroup, C - cache items per threadgroup -template +template< + typename q_t, // query types in shared memory + typename q4_t, + typename q8x8_t, + typename k_t, // key types in shared memory + typename k4x4_t, + typename k8x8_t, + typename v_t, // value types in shared memory + typename v4x4_t, + typename v8x8_t, + typename qk_t, // Q*K types + typename qk8x8_t, + typename s_t, // soft-max types + typename s8x8_t, + typename o_t, // attention accumulation types + typename o4_t, + typename o8x8_t, + typename kd4x4_t, // key type in device memory + short nl_k, + void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), + typename vd4x4_t, // key type in device memory + short nl_v, + void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), + short D, // head size + short Q = 8, // queries per threadgroup + short KV = 8, // key/value processed per each simdgroup + short C = 32> // cache items per threadgroup kernel void kernel_flash_attn_ext( device const char * q, device const char * k, device const char * v, device const char * mask, device float * dst, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant uint64_t & nb21, - constant uint64_t & nb22, - constant uint64_t & nb23, - constant uint64_t & nb31, - constant int64_t & ne1, - constant int64_t & ne2, + constant int32_t & ne01, + constant int32_t & ne02, + constant int32_t & ne03, + constant uint32_t & nb01, + constant uint32_t & nb02, + constant uint32_t & nb03, + constant int32_t & ne11, + constant int32_t & ne_12_2, // assume K and V are same shape + constant int32_t & ne_12_3, + constant uint32_t & nb_12_1, + constant uint32_t & nb_12_2, + constant uint32_t & nb_12_3, + constant uint32_t & nb31, + constant int32_t & ne1, + constant int32_t & ne2, constant float & scale, constant float & max_bias, constant float & m0, constant float & m1, - constant uint32_t & n_head_log2, + constant uint16_t & n_head_log2, constant float & logit_softcap, threadgroup half * shared [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { + ushort3 tgpig[[threadgroup_position_in_grid]], + ushort3 ntg[[threads_per_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups const int iq3 = tgpig[2]; @@ -2803,21 +2844,25 @@ kernel void kernel_flash_attn_ext( const short D8 = D/8; const short D16 = D/16; const short NW = N_SIMDWIDTH; - const short SH = (C + Q); // shared memory per simdgroup in (half) + const short SH = (2*C + Q); // shared memory per simdgroup (s_t == float) - const short T = D + 2*nsg*SH; // shared memory size per query in (half) - const short TF = T/2; // shared memory size per query in (float) - const short T4 = T/4; // shared memory size per query in (half4) + const short TS = nsg*SH; // shared memory size per query in (s_t == float) + const short T = D + 2*TS; // shared memory size per query in (half) - threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data - threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 - threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix + threadgroup q_t * sq = (threadgroup q_t *) (shared + 0*D); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shared + 0*D); // same as above but in q4_t + threadgroup o_t * so = (threadgroup o_t *) (shared + 0*D); // reuse query data for accumulation + threadgroup o4_t * so4 = (threadgroup o4_t *) (shared + 0*D); // same as above but in o4_t + threadgroup s_t * ss = (threadgroup s_t *) (shared + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix - threadgroup half * skv = (threadgroup half *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory - threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4 + threadgroup k_t * sk = (threadgroup k_t *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory + threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t + + threadgroup v_t * sv = (threadgroup v_t *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load V in shared memory + threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) - simdgroup_half8x8 lo[D8]; + o8x8_t lo[D8]; // load heads from Q to shared memory for (short j = sgitg; j < Q; j += nsg) { @@ -2825,71 +2870,61 @@ kernel void kernel_flash_attn_ext( for (short i = tiisg; i < D4; i += NW) { if (iq1 + j < ne01) { - sq4[j*T4 + i] = (half4) q4[i]; + sq4[j*D4 + i] = (q4_t) q4[i]; } else { - sq4[j*T4 + i] = 0.0h; + sq4[j*D4 + i] = (q4_t) 0.0f; } } } // zero out lo for (short i = 0; i < D8; ++i) { - lo[i] = make_filled_simdgroup_matrix(0.0h); + lo[i] = make_filled_simdgroup_matrix((o_t) 0.0f); } // zero out shared memory SH for (short j = 0; j < Q; ++j) { for (short i = tiisg; i < SH; i += NW) { - ss[j*TF + i] = 0.0f; + ss[j*TS + i] = 0.0f; } } threadgroup_barrier(mem_flags::mem_threadgroup); { - float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 }; + half S[Q] = { [0 ... Q-1] = 0.0f }; + half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; // thread indices inside the simdgroup + // TODO: see if we can utilize quad-group functions for better performance + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (6.9.3) const short tx = tiisg%4; const short ty = tiisg/4; - // assume K and V are same shape - const short ne22 = ne12; - const short ne23 = ne13; + // broadcast kv + //const short rk2 = ne02/ne12; + //const short rk3 = ne03/ne13; - // broadcast k - const short rk2 = ne02/ne12; - const short rk3 = ne03/ne13; - - const short ik2 = iq2/rk2; - const short ik3 = iq3/rk3; - - // broadcast v - const short rv2 = ne02/ne22; - const short rv3 = ne03/ne23; - - const short iv2 = iq2/rv2; - const short iv3 = iq3/rv3; + const short ikv2 = iq2/(ne02/ne_12_2); + const short ikv3 = iq3/(ne03/ne_12_3); // load the queries from shared memory into local memory - simdgroup_half8x8 mq[D8]; + q8x8_t mq[D8]; for (short i = 0; i < D8; ++i) { - simdgroup_load(mq[i], sq + i*8, T); + simdgroup_load(mq[i], sq + i*8, D); } - // pointer to the mask - device const half * mp = (device const half *) (mask + iq1*nb31); + const bool has_mask = mask != q; - float slope = 1.0f; + half slope = 1.0f; // ALiBi if (max_bias > 0.0f) { - const uint32_t h = iq2; + const short h = iq2; - const float base = h < n_head_log2 ? m0 : m1; - const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + const half base = h < n_head_log2 ? m0 : m1; + const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; slope = pow(base, exph); } @@ -2902,120 +2937,137 @@ kernel void kernel_flash_attn_ext( break; } + if (has_mask) { + // used to detect blocks full of -INF + half smax = -INFINITY; + + // load the mask in shared memory + for (short j = 0; j < Q; ++j) { + device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*nb31); + + const half m = pm[ic + tiisg]; + + ss[j*TS + C + tiisg] = m; + smax = max(smax, m); + } + + smax = simd_max(smax); + + if (smax == -INFINITY) { + continue; + } + } + // Q*K^T { for (short cc = 0; cc < C/8; ++cc) { - simdgroup_float8x8 mqk = make_filled_simdgroup_matrix(0.h); + qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); // this is compile-time check, so it does not have runtime overhead - if (is_same::value) { + if (is_same::value) { // we can read directly from global memory - device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); + device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); +#pragma unroll for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mk; - simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose + k8x8_t mk; + simdgroup_load(mk, pk + i*8, nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10 simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); } } else { for (short ii = 0; ii < D16; ii += 4) { - device const block_q * pk4 = (device const block_q *) ((device const char *) k + ((ic + 8*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); + device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); if (D16%4 == 0) { // the head is evenly divisible by 4*16 = 64, so no need for bound checks - half4x4 tmp; - dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); - skv4[4*ty + tx] = tmp; + { + k4x4_t tmp; + deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); + sk4x4[4*ty + tx] = tmp; + } simdgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll for (short k = 0; k < 4; ++k) { - simdgroup_half8x8 mk; + k8x8_t mk; - simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); - simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); } } else { if (ii + tx < D16) { - half4x4 tmp; - dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); - skv4[4*ty + tx] = tmp; + k4x4_t tmp; + deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp); + sk4x4[4*ty + tx] = tmp; } simdgroup_barrier(mem_flags::mem_threadgroup); for (short k = 0; k < 4 && ii + k < D16; ++k) { - simdgroup_half8x8 mk; + k8x8_t mk; - simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); - simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); } } } } - simdgroup_store(mqk, ss + 8*cc, TF, 0, false); + // cast qk_t -> s_t + //s8x8_t mqks(1.0f); + //simdgroup_multiply(mqks, mqk, mqks); + //simdgroup_store(mqks, ss + 8*cc, TS, 0, false); + + simdgroup_store(mqk, ss + 8*cc, TS, 0, false); } } - // used to detect blocks full of -INF - float smax = -INFINITY; - // online softmax { - float ms[Q]; - - for (short j = 0; j < Q; ++j) { - const float m = M[j]; + for (ushort j = 0; j < Q; ++j) { + const half m = M[j]; // scale and apply the logitcap / mask - float s = ss[j*TF + tiisg]*scale; + half s = ss[j*TS + tiisg]*scale; if (logit_softcap != 0.0f) { s = logit_softcap*precise::tanh(s); } - if (mask != q) { - // mqk = mqk + mask*slope - s += slope*mp[ic + j*nb31/sizeof(half) + tiisg]; - } + // mqk = mqk + mask*slope + s += slope*ss[j*TS + C + tiisg]; - smax = simd_max(max(smax, s)); M[j] = simd_max(max(M[j], s)); - ms[j] = exp(m - M[j]); - const float vs = exp(s - M[j]); + const half ms = exp(m - M[j]); + const half vs = exp(s - M[j]); - S[j] = S[j]*ms[j] + simd_sum(vs); + S[j] = S[j]*ms + simd_sum(vs); // the P matrix from the paper (Q rows, C columns) - ss[j*TF + tiisg] = vs; - } + ss[j*TS + tiisg] = vs; - // create a QxQ diagonal matrix for rescaling the output - if (tiisg < Q) { - ss[tiisg*TF + C + tiisg] = ms[tiisg]; + // create a QxQ diagonal matrix for rescaling the output + if (tiisg == j) { + ss[j*TS + 2*C + j] = ms; + } } } - // skip -INF blocks - if (smax == -INFINITY) { - continue; - } - // O = diag(ms)*O { - simdgroup_float8x8 mm; - simdgroup_load(mm, ss + C, TF, 0, false); + s8x8_t mm; + simdgroup_load(mm, ss + 2*C, TS, 0, false); +#pragma unroll for (short i = 0; i < D8; ++i) { simdgroup_multiply(lo[i], mm, lo[i]); } @@ -3024,57 +3076,59 @@ kernel void kernel_flash_attn_ext( // O = O + (Q*K^T)*V { for (short cc = 0; cc < C/8; ++cc) { - simdgroup_float8x8 ms; - simdgroup_load(ms, ss + 8*cc, TF, 0, false); + s8x8_t ms; + simdgroup_load(ms, ss + 8*cc, TS, 0, false); - if (is_same::value) { + if (is_same::value) { // we can read directly from global memory - device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); + device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); #pragma unroll for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mv; - simdgroup_load(mv, pv + i*8, nb21/sizeof(half), 0, false); + v8x8_t mv; + simdgroup_load(mv, pv + i*8, nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20 simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]); } } else { for (short ii = 0; ii < D16; ii += 4) { - device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 8*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); + device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); if (D16%4 == 0) { // no need for bound checks - half4x4 tmp; - dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); - skv4[4*ty + tx] = tmp; + { + v4x4_t tmp; + deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); + sv4x4[4*ty + tx] = tmp; + } simdgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll for (short k = 0; k < 4; ++k) { - simdgroup_half8x8 mv; + v8x8_t mv; - simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); - simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); } } else { if (ii + tx < D16) { - half4x4 tmp; - dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); - skv4[4*ty + tx] = tmp; + v4x4_t tmp; + deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp); + sv4x4[4*ty + tx] = tmp; } simdgroup_barrier(mem_flags::mem_threadgroup); for (short k = 0; k < 4 && ii + k < D16; ++k) { - simdgroup_half8x8 mv; + v8x8_t mv; - simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); - simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); } } @@ -3087,23 +3141,23 @@ kernel void kernel_flash_attn_ext( // these are needed for reducing the results from the simdgroups (reuse the ss buffer) for (short j = 0; j < Q; ++j) { if (tiisg == 0) { - ss[j*TF + 0] = S[j]; - ss[j*TF + 1] = M[j]; + ss[j*TS + 0] = S[j]; + ss[j*TS + 1] = M[j]; } } } // reduce the warps sequentially - for (short sg = 1; sg < nsg; ++sg) { - float S = { 0.0f }; - float M = { -FLT_MAX/2 }; + for (ushort sg = 1; sg < nsg; ++sg) { + half S = { 0.0f }; + half M = { -__FLT16_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); // each simdgroup stores its output to shared memory, reusing sq if (sgitg == sg) { for (short i = 0; i < D8; ++i) { - simdgroup_store(lo[i], sq + i*8, T, 0, false); + simdgroup_store(lo[i], so + i*8, D, 0, false); } } @@ -3112,39 +3166,40 @@ kernel void kernel_flash_attn_ext( // the first simdgroup accumulates the results from the other simdgroups if (sgitg == 0) { for (short j = 0; j < Q; ++j) { - const float S0 = ss[j*TF + 0]; - const float S1 = ss[j*TF + sg*SH + 0]; + const half S0 = ss[j*TS + 0]; + const half S1 = ss[j*TS + sg*SH + 0]; - const float M0 = ss[j*TF + 1]; - const float M1 = ss[j*TF + sg*SH + 1]; + const half M0 = ss[j*TS + 1]; + const half M1 = ss[j*TS + sg*SH + 1]; M = max(M0, M1); - const float ms0 = exp(M0 - M); - const float ms1 = exp(M1 - M); + const half ms0 = exp(M0 - M); + const half ms1 = exp(M1 - M); S = S0*ms0 + S1*ms1; if (tiisg == 0) { - ss[j*TF + 0] = S; - ss[j*TF + 1] = M; + ss[j*TS + 0] = S; + ss[j*TS + 1] = M; - ss[j*TF + C + j ] = ms0; - ss[j*TF + C + j + sg*SH] = ms1; + ss[j*TS + 2*C + j ] = ms0; + ss[j*TS + 2*C + j + sg*SH] = ms1; } } // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 { - simdgroup_half8x8 t; - simdgroup_float8x8 ms0; - simdgroup_float8x8 ms1; + s8x8_t ms0; + s8x8_t ms1; - simdgroup_load(ms0, ss + C, TF, 0, false); - simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false); + simdgroup_load(ms0, ss + 2*C, TS, 0, false); + simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false); for (short i = 0; i < D8; ++i) { - simdgroup_load (t, sq + i*8, T, 0, false); + o8x8_t t; + + simdgroup_load (t, so + i*8, D, 0, false); simdgroup_multiply(t, ms1, t); simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t); @@ -3156,7 +3211,7 @@ kernel void kernel_flash_attn_ext( // store result to shared memory (reuse sq) if (sgitg == 0) { for (short i = 0; i < D8; ++i) { - simdgroup_store(lo[i], sq + i*8, T, 0, false); + simdgroup_store(lo[i], so + i*8, D, 0, false); } } @@ -3165,98 +3220,133 @@ kernel void kernel_flash_attn_ext( // final rescale with 1/S and store to global memory if (sgitg == 0) { for (short j = 0; j < Q && iq1 + j < ne01; ++j) { - const float S = ss[j*TF + 0]; + const float S = ss[j*TS + 0]; for (short i = tiisg; i < D4; i += NW) { - dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S; + dst4[((int64_t)iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) so4[j*D4 + i]/S; } } } } -typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; +// TODO: this is quite ugly. in the future these types will be hardcoded in the kernel, but for now keep them as +// template to be able to explore different combinations +// +#define FA_TYPES \ + half, half4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + float, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 -template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; -template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +#endif -template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -// NOTE: can use half instead of float precision for some extra perf -// D - head size, Q - queries per threadgroup, C - cache items per threadgroup -template +template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +#undef FA_TYPES + +template< + typename q4_t, // query types in shared memory + typename q4x4_t, + typename k4x4_t, // key types in shared memory + typename v4x4_t, // value types in shared memory + typename qk_t, // Q*K types + typename s_t, // soft-max types + typename s4_t, + typename s4x4_t, + typename o4x4_t, // attention accumulation types + typename kd4x4_t, // key type in device memory + short nl_k, + void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &), + typename vd4x4_t, // key type in device memory + short nl_v, + void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), + short D, // head size + short Q = 1, // queries per threadgroup + short C = 32> // cache items per threadgroup kernel void kernel_flash_attn_ext_vec( device const char * q, device const char * k, device const char * v, device const char * mask, device float * dst, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant uint64_t & nb21, - constant uint64_t & nb22, - constant uint64_t & nb23, - constant uint64_t & nb31, - constant int64_t & ne1, - constant int64_t & ne2, + constant int32_t & ne01, + constant int32_t & ne02, + constant int32_t & ne03, + constant uint32_t & nb01, + constant uint32_t & nb02, + constant uint32_t & nb03, + constant int32_t & ne11, + constant int32_t & ne_12_2, // assume K and V are same shape + constant int32_t & ne_12_3, + constant uint32_t & nb_12_1, + constant uint32_t & nb_12_2, + constant uint32_t & nb_12_3, + constant uint32_t & nb31, + constant int32_t & ne1, + constant int32_t & ne2, constant float & scale, constant float & max_bias, constant float & m0, constant float & m1, - constant uint32_t & n_head_log2, + constant uint16_t & n_head_log2, constant float & logit_softcap, threadgroup half * shared [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { + ushort3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups const int iq3 = tgpig[2]; @@ -3267,89 +3357,81 @@ kernel void kernel_flash_attn_ext_vec( const short D16 = D/16; const short NW = N_SIMDWIDTH; const short NW4 = NW/4; - const short SH = C; // shared memory per simdgroup in (half) + const short SH = 2*C; // shared memory per simdgroup - const short T = D + 2*nsg*SH; // shared memory size per query in (half) + const short T = D + nsg*SH; // shared memory size per query in (half) - //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data - threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 - threadgroup half4x4 * sq44 = (threadgroup half4x4 *) (shared + 0*D); // same as above but in half4x4 - threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention - threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 - threadgroup float4x4 * sr44 = (threadgroup float4x4 *) (shared + 2*sgitg*D + Q*T); // scratch buffer for the results + //threadgroup q_t * sq = (threadgroup q_t *) (shared + 0*D); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shared + 0*D); // same as above but in q4_t + threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shared + 0*D); // same as above but in q4x4_t + threadgroup s_t * ss = (threadgroup s_t *) (shared + sgitg*SH + Q*D); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shared + sgitg*SH + Q*D); // same as above but in s4_t + threadgroup half * sm = (threadgroup half *) (shared + sgitg*SH + C + Q*D); // scratch buffer for mask + threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shared + sgitg*D + Q*T); // scratch buffer for the results // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) - float4x4 lo[D16/NW4]; + o4x4_t lo[D16/NW4]; // load heads from Q to shared memory device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03)); for (short i = tiisg; i < D4; i += NW) { if (iq1 < ne01) { - sq4[i] = (half4) q4[i]; + sq4[i] = (q4_t) q4[i]; } else { - sq4[i] = 0.0h; + sq4[i] = (q4_t) 0.0f; } } // zero out lo for (short i = 0; i < D16/NW4; i += NW4) { - lo[i] = float4x4(0.0f); + lo[i] = (o4x4_t) 0.0f; } // zero out shared memory SH for (short i = tiisg; i < SH/4; i += NW) { - ss4[i] = 0.0h; + ss4[i] = (s4_t) 0.0f; } threadgroup_barrier(mem_flags::mem_threadgroup); { - float S = 0.0f; - float M = -FLT_MAX/2; + half S = 0.0f; + half M = -__FLT16_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%8; const short ty = tiisg/8; - // assume K and V are same shape - const short ne22 = ne12; - const short ne23 = ne13; + // broadcast kv + //const short rk2 = ne02/ne12; + //const short rk3 = ne03/ne13; - // broadcast k - const short rk2 = ne02/ne12; - const short rk3 = ne03/ne13; - - const short ik2 = iq2/rk2; - const short ik3 = iq3/rk3; - - // broadcast v - const short rv2 = ne02/ne22; - const short rv3 = ne03/ne23; - - const short iv2 = iq2/rv2; - const short iv3 = iq3/rv3; + const short ikv2 = iq2/(ne02/ne_12_2); + const short ikv3 = iq3/(ne03/ne_12_3); // load the queries from shared memory into local memory - float4x4 mq[D16/NW4]; + q4x4_t mq[D16/NW4]; for (short ii = 0; ii < D16; ii += NW4) { - mq[ii/NW4] = (float4x4) sq44[ii + tx]; + mq[ii/NW4] = sq4x4[ii + tx]; } - // pointer to the mask - device const half * mp = (device const half *) (mask + iq1*nb31); + const bool has_mask = mask != q; - float slope = 1.0f; + // pointer to the mask + device const half * pm = (device const half *) (mask + iq1*nb31); + + half slope = 1.0f; // ALiBi if (max_bias > 0.0f) { - const uint32_t h = iq2; + const short h = iq2; - const float base = h < n_head_log2 ? m0 : m1; - const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + const half base = h < n_head_log2 ? m0 : m1; + const short exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - slope = pow(base, exp); + slope = pow(base, exph); } // loop over the KV cache @@ -3360,20 +3442,24 @@ kernel void kernel_flash_attn_ext_vec( break; } + if (has_mask) { + sm[tiisg] = pm[ic + tiisg]; + } + // Q*K^T { // each simdgroup processes 1 query and 4 keys for (short cc = 0; cc < C/4; ++cc) { - float mqk = 0.0; + qk_t mqk = 0.0; - device const block_q * pk = (device const block_q *) ((device const char *) k + ((ic + 4*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); + device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); #pragma unroll for (short ii = 0; ii < D16; ii += NW4) { const short i = ii + tx; - float4x4 mk; - dequantize_func(pk + i/nl, i%nl, mk); + k4x4_t mk; + deq_k(pk + i/nl_k, i%nl_k, mk); mqk += dot(mq[ii/NW4][0], mk[0]) + @@ -3401,7 +3487,7 @@ kernel void kernel_flash_attn_ext_vec( mqk = logit_softcap*precise::tanh(mqk); } - mqk += (mask != q) ? ((float) mp[ic + 4*cc + ty])*slope : (float) 0.0f; + mqk += sm[4*cc + ty]*slope; ss[4*cc + ty] = mqk; } @@ -3412,20 +3498,18 @@ kernel void kernel_flash_attn_ext_vec( // online softmax { - const short p = tiisg; - - const float m = M; - const float s = ss[p]; + const half m = M; + const half s = ss[tiisg]; M = simd_max(max(M, s)); - const float ms = exp(m - M); - const float vs = exp(s - M); + const half ms = exp(m - M); + const half vs = exp(s - M); S = S*ms + simd_sum(vs); // the P matrix from the paper (Q rows, C columns) - ss[p] = vs; + ss[tiisg] = vs; // O = diag(ms)*O #pragma unroll @@ -3440,18 +3524,18 @@ kernel void kernel_flash_attn_ext_vec( { #pragma unroll for (short cc = 0; cc < C/4; ++cc) { - device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 4*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); + device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); - const float4x4 lss(ss[4*cc + ty]); + const s4x4_t ms(ss[4*cc + ty]); #pragma unroll for (short ii = 0; ii < D16; ii += NW4) { const short i = ii + tx; - float4x4 mv; - dequantize_func(pv4 + i/nl, i%nl, mv); + v4x4_t mv; + deq_v(pv4 + i/nl_v, i%nl_v, mv); - lo[ii/NW4] += mv*lss; + lo[ii/NW4] += mv*ms; } } } @@ -3459,8 +3543,8 @@ kernel void kernel_flash_attn_ext_vec( // these are needed for reducing the results from the simdgroups (reuse the ss buffer) if (tiisg == 0) { - ss[0] = S; - ss[1] = M; + ss[0] = (s_t) S; + ss[1] = (s_t) M; } } @@ -3489,7 +3573,7 @@ kernel void kernel_flash_attn_ext_vec( // store results to shared memory for (short i = tiisg; i < D16; i += NW4) { - sr44[i] = lo[i/NW4]; + sr4x4[i] = lo[i/NW4]; } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3497,18 +3581,18 @@ kernel void kernel_flash_attn_ext_vec( // parallel reduce for (short r = nsg/2; r > 0; r >>= 1) { if (sgitg < r) { - const float S0 = ss[ 0]; - const float S1 = ss[r*SH + 0]; + const half S0 = ss[ 0]; + const half S1 = ss[r*SH + 0]; - const float M0 = ss[ 1]; - const float M1 = ss[r*SH + 1]; + const half M0 = ss[ 1]; + const half M1 = ss[r*SH + 1]; - const float M = max(M0, M1); + const half M = max(M0, M1); - const float ms0 = exp(M0 - M); - const float ms1 = exp(M1 - M); + const half ms0 = exp(M0 - M); + const half ms1 = exp(M1 - M); - const float S = S0*ms0 + S1*ms1; + const half S = S0*ms0 + S1*ms1; if (tiisg == 0) { ss[0] = S; @@ -3517,7 +3601,7 @@ kernel void kernel_flash_attn_ext_vec( // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 for (short i = tiisg; i < D16; i += NW) { - sr44[i] = sr44[i]*ms0 + sr44[i + r*D16]*ms1; + sr4x4[i] = sr4x4[i]*ms0 + sr4x4[i + r*D16]*ms1; } } @@ -3531,26 +3615,45 @@ kernel void kernel_flash_attn_ext_vec( const float S = ss[0]; for (short i = tiisg; i < D16; i += NW) { - dst44[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = sr44[i]/S; + dst44[((int64_t)iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = (float4x4) sr4x4[i]/S; } } } -typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; +// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem +// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max +// +#define FA_TYPES \ + half4, half4x4, \ + half4x4, \ + half4x4, \ + float, \ + half, half4, half4x4, \ + half4x4 -template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; -template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +#endif +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +#undef FA_TYPES template kernel void kernel_cpy( diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bc034015f..cd26a361b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4228,6 +4228,15 @@ void ggml_flash_attn_ext_set_prec( ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second } +enum ggml_prec ggml_flash_attn_ext_get_prec( + const struct ggml_tensor * a) { + GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT); + + const int32_t prec_i32 = ggml_get_op_params_i32(a, 3); + + return (enum ggml_prec) prec_i32; +} + // ggml_flash_attn_back struct ggml_tensor * ggml_flash_attn_back( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9d48a2717..65be43281 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3745,7 +3745,7 @@ static std::vector> make_test_cases_eval() { for (int nh : { 32, }) { for (int kv : { 512, 1024, }) { for (int nb : { 1, 3, 32, 35, }) { - for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { + for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV)); } } From 695ad752b2631af84ba321177656705b30c6e401 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Nov 2024 18:37:41 +0200 Subject: [PATCH 057/279] metal : improve clarity (minor) (#10171) --- ggml/src/ggml-metal.metal | 70 +++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index edce74108..89f12724d 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -3356,7 +3356,7 @@ kernel void kernel_flash_attn_ext_vec( const short D4 = D/4; const short D16 = D/16; const short NW = N_SIMDWIDTH; - const short NW4 = NW/4; + const short NL = NW/4; const short SH = 2*C; // shared memory per simdgroup const short T = D + nsg*SH; // shared memory size per query in (half) @@ -3370,7 +3370,7 @@ kernel void kernel_flash_attn_ext_vec( threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shared + sgitg*D + Q*T); // scratch buffer for the results // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) - o4x4_t lo[D16/NW4]; + o4x4_t lo[D16/NL]; // load heads from Q to shared memory device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03)); @@ -3384,7 +3384,7 @@ kernel void kernel_flash_attn_ext_vec( } // zero out lo - for (short i = 0; i < D16/NW4; i += NW4) { + for (short i = 0; i < D16/NL; ++i) { lo[i] = (o4x4_t) 0.0f; } @@ -3400,8 +3400,8 @@ kernel void kernel_flash_attn_ext_vec( half M = -__FLT16_MAX__/2; // thread indices inside the simdgroup - const short tx = tiisg%8; - const short ty = tiisg/8; + const short tx = tiisg%NL; + const short ty = tiisg/NL; // broadcast kv //const short rk2 = ne02/ne12; @@ -3411,10 +3411,10 @@ kernel void kernel_flash_attn_ext_vec( const short ikv3 = iq3/(ne03/ne_12_3); // load the queries from shared memory into local memory - q4x4_t mq[D16/NW4]; + q4x4_t mq[D16/NL]; - for (short ii = 0; ii < D16; ii += NW4) { - mq[ii/NW4] = sq4x4[ii + tx]; + for (short ii = 0; ii < D16; ii += NL) { + mq[ii/NL] = sq4x4[ii + tx]; } const bool has_mask = mask != q; @@ -3455,17 +3455,17 @@ kernel void kernel_flash_attn_ext_vec( device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); #pragma unroll - for (short ii = 0; ii < D16; ii += NW4) { + for (short ii = 0; ii < D16; ii += NL) { const short i = ii + tx; k4x4_t mk; deq_k(pk + i/nl_k, i%nl_k, mk); mqk += - dot(mq[ii/NW4][0], mk[0]) + - dot(mq[ii/NW4][1], mk[1]) + - dot(mq[ii/NW4][2], mk[2]) + - dot(mq[ii/NW4][3], mk[3]); + dot(mq[ii/NL][0], mk[0]) + + dot(mq[ii/NL][1], mk[1]) + + dot(mq[ii/NL][2], mk[2]) + + dot(mq[ii/NL][3], mk[3]); } // simdgroup reduce @@ -3513,8 +3513,8 @@ kernel void kernel_flash_attn_ext_vec( // O = diag(ms)*O #pragma unroll - for (short ii = 0; ii < D16; ii += NW4) { - lo[ii/NW4] *= ms; + for (short ii = 0; ii < D16; ii += NL) { + lo[ii/NL] *= ms; } } @@ -3529,13 +3529,13 @@ kernel void kernel_flash_attn_ext_vec( const s4x4_t ms(ss[4*cc + ty]); #pragma unroll - for (short ii = 0; ii < D16; ii += NW4) { + for (short ii = 0; ii < D16; ii += NL) { const short i = ii + tx; v4x4_t mv; deq_v(pv4 + i/nl_v, i%nl_v, mv); - lo[ii/NW4] += mv*ms; + lo[ii/NL] += mv*ms; } } } @@ -3557,23 +3557,37 @@ kernel void kernel_flash_attn_ext_vec( // [ 5, 13, 21, 29] -> [ 5] // [ 6, 14, 22, 30] -> [ 6] // [ 7, 15, 23, 31] -> [ 7] - for (short ii = 0; ii < D16; ii += NW4) { - lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 16); - lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 8); + for (short ii = 0; ii < D16; ii += NL) { + lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 16); + lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 8); + //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 4); + //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 2); + //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 1); - lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 16); - lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 8); + lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 16); + lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 8); + //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 4); + //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 2); + //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 1); - lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 16); - lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 8); + lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 16); + lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 8); + //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 4); + //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 2); + //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 1); - lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 16); - lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 8); + lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 16); + lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 8); + //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 4); + //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 2); + //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 1); } + threadgroup_barrier(mem_flags::mem_threadgroup); + // store results to shared memory - for (short i = tiisg; i < D16; i += NW4) { - sr4x4[i] = lo[i/NW4]; + for (short i = tiisg; i < D16; i += NL) { + sr4x4[i] = lo[i/NL]; } threadgroup_barrier(mem_flags::mem_threadgroup); From ec450d3bbf9fdb3cd06b27c00c684fd1861cb0cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Nov 2024 21:59:46 +0200 Subject: [PATCH 058/279] metal : opt-in compile flag for BF16 (#10218) * metal : opt-in compile flag for BF16 ggml-ci * ci : use BF16 ggml-ci * swift : switch back to v12 * metal : has_float -> use_float ggml-ci * metal : fix BF16 check in MSL ggml-ci --- .github/workflows/build.yml | 17 +++++++++-- Makefile | 4 +++ ci/run.sh | 2 +- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 4 +++ ggml/src/ggml-metal.m | 59 ++++++++++++++++++++++--------------- ggml/src/ggml-metal.metal | 32 ++++++++++---------- 7 files changed, 77 insertions(+), 42 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 02dcee963..1e37a3c79 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,7 +55,13 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. + cmake .. \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=ON \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_RPC=ON \ + -DBUILD_SHARED_LIBS=OFF cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -113,7 +119,12 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=ON \ + -DGGML_METAL=OFF \ + -DGGML_RPC=ON \ + -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -569,6 +580,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -599,6 +611,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ diff --git a/Makefile b/Makefile index b9131eae5..dfa32d516 100644 --- a/Makefile +++ b/Makefile @@ -878,6 +878,10 @@ ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJ_GGML += ggml/src/ggml-metal.o + +ifdef GGML_METAL_USE_BF16 + MK_CPPFLAGS += -DGGML_METAL_USE_BF16 +endif # GGML_METAL_USE_BF16 ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif diff --git a/ci/run.sh b/ci/run.sh index 21b62dd1e..20610e560 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -39,7 +39,7 @@ SRC=`pwd` CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON" fi if [ ! -z ${GG_BUILD_CUDA} ]; then diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6866a25d3..81b7a02f5 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -153,6 +153,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) +option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL}) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 34b81bd7f..6c5b816d2 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -58,6 +58,10 @@ if (GGML_METAL) add_compile_definitions(GGML_METAL_NDEBUG) endif() + if (GGML_METAL_USE_BF16) + add_compile_definitions(GGML_METAL_USE_BF16) + endif() + # copy ggml-common.h and ggml-metal.metal to bin directory configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index e19397fd2..10d59cb9f 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -39,6 +39,7 @@ static struct ggml_backend_metal_device_context { bool has_simdgroup_reduction; bool has_simdgroup_mm; bool has_bfloat; + bool use_bfloat; char name[128]; } g_ggml_ctx_dev_main = { @@ -47,6 +48,7 @@ static struct ggml_backend_metal_device_context { /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, /*.has_bfloat =*/ false, + /*.use_bfloat =*/ false, /*.name =*/ "", }; @@ -65,6 +67,12 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; +#if defined(GGML_METAL_USE_BF16) + ctx->use_bfloat = ctx->has_bfloat; +#else + ctx->use_bfloat = false; +#endif + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -504,6 +512,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de // dictionary of preprocessor macros NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + if (ctx_dev->use_bfloat) { + [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"]; + } + MTLCompileOptions * options = [MTLCompileOptions new]; options.preprocessorMacros = prep; @@ -556,7 +568,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false"); GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false"); - GGML_LOG_INFO("%s: bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); ctx->capture_next_compute = false; @@ -608,7 +621,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; - const bool has_bfloat = ctx_dev->has_bfloat; + const bool use_bfloat = ctx_dev->use_bfloat; // simd_sum and simd_max requires MTLGPUFamilyApple7 @@ -644,7 +657,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, get_rows_bf16, has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, get_rows_bf16, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); @@ -671,10 +684,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); @@ -703,7 +716,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, has_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, has_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, has_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, mul_mv_id_bf16_f32, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, mul_mv_id_bf16_f32, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, has_simdgroup_reduction); @@ -725,7 +738,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, mul_mm_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, mul_mm_bf16_f32, has_simdgroup_mm && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, has_simdgroup_mm); @@ -747,7 +760,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm); @@ -788,12 +801,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, flash_attn_ext_bf16_h112, has_simdgroup_mm && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, flash_attn_ext_bf16_h128, has_simdgroup_mm && has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, flash_attn_ext_bf16_h112, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, flash_attn_ext_bf16_h128, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm); @@ -825,14 +838,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, flash_attn_ext_vec_bf16_h256, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, flash_attn_ext_vec_bf16_h256, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); @@ -840,11 +853,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, cpy_bf16_f32, has_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, cpy_bf16_bf16, has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, cpy_bf16_f32, use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, cpy_bf16_bf16, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); @@ -936,9 +949,9 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) { const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; - const bool has_bfloat = ctx_dev->has_bfloat; + const bool use_bfloat = ctx_dev->use_bfloat; - if (!has_bfloat) { + if (!use_bfloat) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 89f12724d..7e1517414 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -18,11 +18,11 @@ using namespace metal; // .../usr/bin/metal -dM -E -c ggml/src/ggml-metal.metal // .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal // -#if __METAL_VERSION__ < 310 -#define GGML_METAL_NO_BFLOAT +#if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16) +#undef GGML_METAL_USE_BF16 #endif -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) typedef matrix bfloat4x4; #endif @@ -41,7 +41,7 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) reg = (type4x4)(*src); } -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { reg = (type4x4)(*src); @@ -2082,7 +2082,7 @@ typedef decltype(kernel_mul_mv) mul_mv_t; template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t kernel_mul_mv; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv; #endif @@ -2155,7 +2155,7 @@ kernel void kernel_mul_mv_1row( typedef decltype(kernel_mul_mv_1row) mul_mv_1row_t; template [[host_name("kernel_mul_mv_f16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; #endif @@ -2217,7 +2217,7 @@ kernel void kernel_mul_mv_l4( typedef decltype(kernel_mul_mv_l4) mul_mv_l4_t; template [[host_name("kernel_mul_mv_f16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; #endif @@ -3249,7 +3249,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_ template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -3648,7 +3648,7 @@ kernel void kernel_flash_attn_ext_vec( typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #endif template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; @@ -3658,7 +3658,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #endif template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; @@ -3715,12 +3715,12 @@ typedef decltype(kernel_cpy) kernel_cpy_t; template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy; template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy; #endif template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy; template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy; template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy; #endif @@ -6628,7 +6628,7 @@ typedef decltype(kernel_get_rows_f) get_rows_f_t; template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f; #endif @@ -6662,7 +6662,7 @@ typedef decltype(kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; #endif template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -6693,7 +6693,7 @@ typedef decltype(kernel_mul_mm_id) mat_mm_id_t; template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; #endif template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -6919,7 +6919,7 @@ typedef decltype(kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; -#if !defined(GGML_METAL_NO_BFLOAT) +#if defined(GGML_METAL_USE_BF16) template [[host_name("kernel_mul_mv_id_bf16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; #endif template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; From 8fc393f246c550d2481e53323a47644a94e8d01f Mon Sep 17 00:00:00 2001 From: haopeng <657407891@qq.com> Date: Sat, 9 Nov 2024 15:06:54 +0800 Subject: [PATCH 059/279] scripts : fix pattern and get n_tokens in one go (#10221) --- examples/chat-persistent.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index d9cab9836..9d761ebb8 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin" NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt" NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin" -SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+' -SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' +SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\ +'|'\ +'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" CTX_SIZE=2048 @@ -129,15 +130,12 @@ while read -e line; do printf ' ' - # HACK get num tokens from debug message - # TODO get both messages in one go - if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || - ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then + if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then echo >&2 "Couldn't get number of tokens from ./llama-cli output!" exit 1 fi - n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg"))) + n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")") if ((n_tokens > CTX_ROTATE_POINT)); then tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE" From e89213492d3e01705739789733f0f2d250b4c449 Mon Sep 17 00:00:00 2001 From: amritahs-ibm Date: Sat, 9 Nov 2024 12:47:50 +0530 Subject: [PATCH 060/279] ggml : optimize llamafile cpu matrix multiplication for ppc64le (#10156) This change upstreams llamafile's cpu matrix multiplication kernels for ppc64le using MMA builtins for FP32 datatype. This change results in a consistent 90% improvement in input processing time, and 20% to 80% improvement in output processing time, across various batch sizes. The patch is tested with Meta-Lllama-3-8B, Mistral-7B, Llama-2-7B-chat-hf models on a IBM POWER10 machine. Signed-off-by: Amrita H S --- ggml/src/CMakeLists.txt | 9 +- ggml/src/llamafile/sgemm.cpp | 608 +++++++++++++++++++++++++++++++++++ 2 files changed, 615 insertions(+), 2 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6c5b816d2..a05f8c505 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1265,8 +1265,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" + OUTPUT_VARIABLE POWER10_M) + string(FIND ${POWER10_M} "POWER10" substring_index) + if(${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index 9eead3f61..da4146ec4 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(__MMA__) +typedef vector unsigned char vec_t; +typedef __vector_quad acc_t; +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// // VECTORIZED FUSED MULTIPLY ADD @@ -1026,6 +1030,600 @@ class tinyBLAS_Q0_AVX { }; #endif // __AVX__ +//PPC Implementation +#if defined(__MMA__) + +#define SAVE_ACC(ACC, ii, jj) \ + __builtin_mma_disassemble_acc(vec_C, ACC); \ + for (int I = 0; I < 4; I++) { \ + for (int J = 0; J < 4; J++) { \ + *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \ + } \ + } \ + +template +class tinyBLAS_PPC { + public: + tinyBLAS_PPC(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n) { + mnpack(0, m, 0, n); + } + + private: + + void (tinyBLAS_PPC::*kernel)(int64_t, int64_t); + + void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) { + int64_t i, j; + float *aoffset = NULL, *boffset = NULL; + float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + + aoffset = const_cast(a); + boffset = vec; + j = (rows >> 3); + if (j > 0) { + do { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + i = (cols >> 3); + if (i > 0) { + __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; + vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2]; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4); + C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5); + C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6); + C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7); + C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8); + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + __builtin_vsx_disassemble_pair(c5, &C5); + __builtin_vsx_disassemble_pair(c6, &C6); + __builtin_vsx_disassemble_pair(c7, &C7); + __builtin_vsx_disassemble_pair(c8, &C8); + + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); + t3 = vec_mergeh(c5[0], c6[0]); + t4 = vec_mergeh(c7[0], c8[0]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); + t3 = vec_mergel(c5[0], c6[0]); + t4 = vec_mergel(c7[0], c8[0]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergeh(c1[1], c2[1]); + t2 = vec_mergeh(c3[1], c4[1]); + t3 = vec_mergeh(c5[1], c6[1]); + t4 = vec_mergeh(c7[1], c8[1]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t1 = vec_mergel(c1[1], c2[1]); + t2 = vec_mergel(c3[1], c4[1]); + t3 = vec_mergel(c5[1], c6[1]); + t4 = vec_mergel(c7[1], c8[1]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 8*lda; + aoffset2 += 8*lda; + aoffset3 += 8*lda; + aoffset4 += 8*lda; + boffset += 64; + i--; + } while(i > 0); + } + if (cols & 4) { + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t3, t4, 0); + t7 = vec_xxpermdi(t1, t2, 3); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + } + j--; + } while(j > 0); + } + + if (rows & 4) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + i = (cols >> 3); + if (i > 0) { + __vector_pair C1, C2, C3, C4; + vector float c1[2], c2[2], c3[2], c4[2]; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4); + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); + t3 = vec_mergel(c1[0], c2[0]); + t4 = vec_mergel(c3[0], c4[0]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t1, t2, 3); + t7 = vec_xxpermdi(t3, t4, 0); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergeh(c1[1], c2[1]); + t2 = vec_mergeh(c3[1], c4[1]); + t3 = vec_mergel(c1[1], c2[1]); + t4 = vec_mergel(c3[1], c4[1]); + t5 = vec_xxpermdi(t1, t2, 0); + t6 = vec_xxpermdi(t1, t2, 3); + t7 = vec_xxpermdi(t3, t4, 0); + t8 = vec_xxpermdi(t3, t4, 3); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 8*lda; + aoffset2 += 8*lda; + aoffset3 += 8*lda; + aoffset4 += 8*lda; + boffset += 32; + i--; + } while(i > 0); + } + + if (cols & 4) { + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_xxpermdi(t1, t2, 0); + t4 = vec_xxpermdi(t1, t2, 3); + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_xxpermdi(t1, t2, 0); + t4 = vec_xxpermdi(t1, t2, 3); + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + } + } + if (rows & 3) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + if (cols & 4) { + vector float c1, c2, c3, c4 = {0}; + vector float t1, t2, t3, t4; + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_xxpermdi(t1, t2, 0); + t4 = vec_xxpermdi(t1, t2, 3); + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_xxpermdi(t1, t2, 0); + t4 = vec_xxpermdi(t1, t2, 3); + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + } + } + } + + void KERNEL_4x4(int64_t ii, int64_t jj) { + vec_t vec_A[4], vec_B[4], vec_C[4]; + acc_t acc_0; + __builtin_mma_xxsetaccz(&acc_0); + for (int l = 0; l < k; l+=4) { + READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); + READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]); + } + SAVE_ACC(&acc_0, ii, jj); + } + + void KERNEL_4x8(int64_t ii, int64_t jj) { + vec_t vec_A[4], vec_B[8], vec_C[4]; + acc_t acc_0, acc_1; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + for (int64_t l = 0; l < k; l+=4) { + READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); + READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]); + __builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]); + __builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]); + } + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii, jj+4); + } + + void KERNEL_8x4(int64_t ii, int64_t jj) { + vec_t vec_A[8], vec_B[4], vec_C[4]; + acc_t acc_0, acc_1; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + for (int64_t l = 0; l < k; l+=4) { + READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A); + READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]); + __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]); + } + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii+4, jj); + } + + void KERNEL_8x8(int64_t ii, int64_t jj) { + vec_t vec_A[16], vec_B[16], vec_C[4]; + acc_t acc_0, acc_1, acc_2, acc_3; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + __builtin_mma_xxsetaccz(&acc_2); + __builtin_mma_xxsetaccz(&acc_3); + for (int l = 0; l < k; l+=8) { + READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A); + READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B); + for(int x = 0; x < 16; x+=2) { + __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]); + __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]); + __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]); + __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]); + } + } + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii, jj+4); + SAVE_ACC(&acc_2, ii+4, jj); + SAVE_ACC(&acc_3, ii+4, jj+4); + } + + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + int m_rem = MIN(m - m0, 16); + int n_rem = MIN(n - n0, 16); + if (m_rem >= 16 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if(m_rem >= 8 && n_rem >= 16) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 8) { + mc = 4; + nc = 8; + gemm<4,8>(m0, m, n0, n); + } else if (m_rem >= 8 && n_rem >= 4) { + mc = 8; + nc = 4; + gemm<8,4>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 4) { + mc = 4; + nc = 4; + gemm<4,4>(m0, m, n0, n); + } else if ((m_rem < 4) && (n_rem > 4)) { + nc = 4; + switch(m_rem) { + case 1: + mc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 2: + mc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 3: + mc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + default: + return; + } + } else if ((m_rem > 4) && (n_rem < 4)) { + mc = 4; + switch(n_rem) { + case 1: + nc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 2: + nc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 3: + nc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + default: + return; + } + } else { + switch((m_rem << 4) | n_rem) { + case 0x43: + mc = 4; + nc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x42: + mc = 4; + nc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x41: + mc = 4; + nc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x34: + mc = 3; + nc = 4; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x33: + mc = 3; + nc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x32: + mc = 3; + nc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x31: + mc = 3; + nc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x24: + mc = 2; + nc = 4; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x23: + mc = 2; + nc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x22: + mc = 2; + nc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x21: + mc = 2; + nc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x14: + mc = 1; + nc = 4; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x13: + mc = 1; + nc = 3; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x12: + mc = 1; + nc = 2; + gemm_small(m0, m, n0, n, mc, nc); + break; + case 0x11: + mc = 1; + nc = 1; + gemm_small(m0, m, n0, n, mc, nc); + break; + default: + return; + } + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + vec_t vec_C[4]; + acc_t acc_0; + __builtin_mma_xxsetaccz(&acc_0); + vec_t vec_A[4], vec_B[4]; + for (int l=0; l= 4 && RM == 1) { + float* a = const_cast(A+(ii)*lda+l); + READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + vec_A[0] = (vec_t)vec_xl(0,a); + vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1)); + vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2)); + vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3)); + } else { + READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A); + READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B); + } + __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); + __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]); + } + __builtin_mma_disassemble_acc(vec_C, &acc_0); + for (int I = 0; I < RM; I++) { + for (int J = 0; J < RN; J++) { + *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); + } + } + } + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (RM == 4 && RN == 4) { + kernel = &tinyBLAS_PPC::KERNEL_4x4; + } else if (RM == 4 && RN == 8) { + kernel = &tinyBLAS_PPC::KERNEL_4x8; + } else if (RM == 8 && RN == 4) { + kernel = &tinyBLAS_PPC::KERNEL_8x4; + } else if (RM == 8 && RN == 8) { + kernel = &tinyBLAS_PPC::KERNEL_8x8; + } + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + (this->*kernel)(ii, jj); + } + } + + const TA *const A; + const TB *const B; + TC *C; + TA *At; + TB *Bt; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; +#endif } // namespace /** @@ -1114,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda ith, nth}; tb.matmul(m, n); return true; +#elif defined(__MMA__) + if (k % 8) + return false; + tinyBLAS_PPC tb{ + k, (const float *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; #else return false; #endif From 5b359bb1e3585de45bec79fd6c18934897662cdf Mon Sep 17 00:00:00 2001 From: SXX Date: Sat, 9 Nov 2024 15:35:46 +0800 Subject: [PATCH 061/279] =?UTF-8?q?ggml:=20fix=20zero=20division=20in=20?= =?UTF-8?q?=E2=80=98dne=E2=80=99=20calculation=20in=20CUDA=20COUNT=5FEQUAL?= =?UTF-8?q?=20operator=20when=20=E2=80=98ne=E2=80=99=20is=20small=20(#1021?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ggml/src/ggml-cuda/count-equal.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu index ffb053b10..08898115d 100644 --- a/ggml/src/ggml-cuda/count-equal.cu +++ b/ggml/src/ggml-cuda/count-equal.cu @@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int"); - const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE); + const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE); CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream)); From 46323fa9efd5e6c8aeef8d6eb6c332ee0d95eb13 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Nov 2024 11:21:49 +0200 Subject: [PATCH 062/279] metal : hide debug messages from normal log --- ggml/src/ggml-metal.m | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 10d59cb9f..c112fd866 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -596,17 +596,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->kernels[i].pipeline = nil; } - /* - GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ - (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ - (int) kernel->pipeline.threadExecutionWidth); \ - */ #define GGML_METAL_ADD_KERNEL(e, name, supported) \ if (supported) { \ struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ id metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \ kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \ - GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.threadExecutionWidth); \ [metal_function release]; \ From f018acba22095b8995bf6c5ef815b16a3ce4cf1b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Nov 2024 11:26:34 +0200 Subject: [PATCH 063/279] llama : fix Qwen model type strings --- src/llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 034441e1f..4d89c5222 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2301,6 +2301,7 @@ enum e_model { MODEL_1B, MODEL_1_3B, MODEL_1_4B, + MODEL_1_5B, MODEL_1_6B, MODEL_2B, MODEL_2_8B, @@ -5227,6 +5228,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_1B: return "1B"; case MODEL_1_3B: return "1.3B"; case MODEL_1_4B: return "1.4B"; + case MODEL_1_5B: return "1.5B"; case MODEL_1_6B: return "1.6B"; case MODEL_2B: return "2B"; case MODEL_2_8B: return "2.8B"; @@ -5598,6 +5600,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break; + case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break; case 80: model.type = e_model::MODEL_70B; break; From bb38cdd8baf37de1fadab3e867c6ba4ae452eff6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Nov 2024 11:52:45 +0200 Subject: [PATCH 064/279] metal : fix F32 accumulation in FA vec kernel (#10232) --- ggml/src/ggml-metal.metal | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 7e1517414..1f233ba7f 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -3450,7 +3450,7 @@ kernel void kernel_flash_attn_ext_vec( { // each simdgroup processes 1 query and 4 keys for (short cc = 0; cc < C/4; ++cc) { - qk_t mqk = 0.0; + qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 }; device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*nb_12_1 + ikv2*nb_12_2 + ikv3*nb_12_3)); @@ -3461,13 +3461,14 @@ kernel void kernel_flash_attn_ext_vec( k4x4_t mk; deq_k(pk + i/nl_k, i%nl_k, mk); - mqk += - dot(mq[ii/NL][0], mk[0]) + - dot(mq[ii/NL][1], mk[1]) + - dot(mq[ii/NL][2], mk[2]) + - dot(mq[ii/NL][3], mk[3]); + mqka[0] += dot(mq[ii/NL][0], mk[0]); + mqka[1] += dot(mq[ii/NL][1], mk[1]); + mqka[2] += dot(mq[ii/NL][2], mk[2]); + mqka[3] += dot(mq[ii/NL][3], mk[3]); } + qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3]; + // simdgroup reduce // [ 0 .. 7] -> [ 0] // [ 8 .. 15] -> [ 8] From 39a334a9aaf2000f93a899d9f43d889e460640ee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Nov 2024 11:53:02 +0200 Subject: [PATCH 065/279] metal : fix build and some more comments (#10229) --- ggml/src/ggml-metal.m | 2 ++ ggml/src/ggml-metal.metal | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index c112fd866..04ec5117f 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3041,6 +3041,8 @@ static void ggml_metal_encode_node( bool use_vec_kernel = false; + // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0) + // for now avoiding mainly to keep the number of templates/kernels a bit lower if (ne01 >= 4 || (ne00%128 != 0)) { switch (src1->type) { case GGML_TYPE_F16: diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 1f233ba7f..779f45968 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -3356,8 +3356,8 @@ kernel void kernel_flash_attn_ext_vec( const short D4 = D/4; const short D16 = D/16; const short NW = N_SIMDWIDTH; - const short NL = NW/4; - const short SH = 2*C; // shared memory per simdgroup + const short NL = NW/4; // note: this can be adjusted to support D%64 == 0 and D%32 == 0 + const short SH = 2*C; // shared memory per simdgroup const short T = D + nsg*SH; // shared memory size per query in (half) @@ -3448,7 +3448,7 @@ kernel void kernel_flash_attn_ext_vec( // Q*K^T { - // each simdgroup processes 1 query and 4 keys + // each simdgroup processes 1 query and 4 (NW/NL) keys for (short cc = 0; cc < C/4; ++cc) { qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 }; @@ -3646,7 +3646,7 @@ kernel void kernel_flash_attn_ext_vec( half, half4, half4x4, \ half4x4 -typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; +typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #if defined(GGML_METAL_USE_BF16) From 6423c65aa8be1b98f990cf207422505ac5a441a1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Nov 2024 11:53:13 +0200 Subject: [PATCH 066/279] metal : reorder write loop in mul mat kernel + style (#10231) * metal : reorder write loop * metal : int -> short, style ggml-ci --- ggml/src/ggml-metal.metal | 76 ++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 779f45968..413661c8a 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -6318,8 +6318,8 @@ kernel void kernel_mul_mm(device const uchar * src0, const uint im = tgpig.z; // if this block is of 64x32 shape or smaller - short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + short n_rows = (ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; @@ -6327,9 +6327,10 @@ kernel void kernel_mul_mm(device const uchar * src0, simdgroup_T8x8 ma[4]; simdgroup_float8x8 mb[2]; - simdgroup_float8x8 c_res[8]; - for (int i = 0; i < 8; i++){ - c_res[i] = make_filled_simdgroup_matrix(0.f); + simdgroup_float8x8 mc[8]; + + for (short i = 0; i < 8; i++){ + mc[i] = make_filled_simdgroup_matrix(0.f); } short il = (tiitg % THREAD_PER_ROW); @@ -6340,7 +6341,7 @@ kernel void kernel_mul_mm(device const uchar * src0, uint offset0 = (i12/r2)*nb02 + (i13/r3)*nb03; ushort offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*nb01 + offset0) + offset1; device const float * y = (device const float *)(src1 + nb13 * i13 + nb12 * i12 @@ -6354,13 +6355,13 @@ kernel void kernel_mul_mm(device const uchar * src0, threadgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll(16) - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + for (short i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ + + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ + + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? x + (2+nl-1)/nl : x; @@ -6369,27 +6370,27 @@ kernel void kernel_mul_mm(device const uchar * src0, threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup T * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); #pragma unroll(4) - for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { #pragma unroll(4) - for (int i = 0; i < 4; i++) { - simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + for (short i = 0; i < 4; i++) { + simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } simdgroup_barrier(mem_flags::mem_none); #pragma unroll(2) - for (int i = 0; i < 2; i++) { - simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + for (short i = 0; i < 2; i++) { + simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE; #pragma unroll(8) - for (int i = 0; i < 8; i++){ - simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + for (short i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } } } @@ -6397,25 +6398,36 @@ kernel void kernel_mul_mm(device const uchar * src0, if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; - for (int i = 0; i < 8; i++) { - simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); } } else { // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { - simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + threadgroup float * temp_str = ((threadgroup float *) shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); - device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; if (sgitg == 0) { - for (int i = 0; i < n_rows; i++) { - for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { + device float * D = dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*ne0 + im*ne1*ne0; + device float4 * D4 = (device float4 *) D; + + threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); + threadgroup float4 * C4 = (threadgroup float4 *) C; + + int i = 0; + for (; i < n_rows/4; i++) { + *(D4 + i) = *(C4 + i); + } + + i *= 4; + for (; i < n_rows; i++) { + *(D + i) = *(C + i); } } } From 160687b3ed002eee83a04de83a9cd752f928ced1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 10 Nov 2024 05:37:56 -0600 Subject: [PATCH 067/279] vulkan: Fix newly added tests for permuted mul_mat and 1D im2col (#10226) --- ggml/src/ggml-vulkan.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index a8e78c4db..6c4c92262 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -3147,7 +3147,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; - if (mmp == nullptr) { + if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16); } @@ -3630,9 +3630,19 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); - if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { + if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 && + // detect 0213 permutation, and batch size of 1 + src0->nb[0] <= src0->nb[2] && + src0->nb[2] <= src0->nb[1] && + src0->nb[1] <= src0->nb[3] && + src1->nb[0] <= src1->nb[2] && + src1->nb[2] <= src1->nb[1] && + src1->nb[1] <= src1->nb[3] && + src0->ne[3] == 1 && + src1->ne[3] == 1) { ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun); - } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) { + } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && + !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) { ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun); } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun); @@ -3708,7 +3718,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; - if (mmp == nullptr) { + if (qx_needs_dequant) { GGML_ABORT("fatal error"); } @@ -4470,7 +4480,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint32_t OH = is_2D ? dst->ne[2] : 1; const uint32_t OW = dst->ne[1]; - const uint32_t batch = src1->ne[3]; + const uint32_t batch = src1->ne[is_2D ? 3 : 2]; elements = { OW * KW * KH, OH, batch * IC }; } break; @@ -4915,7 +4925,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint32_t OW = dst->ne[1]; const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32 const uint32_t pelements = OW * KW * KH; @@ -6804,6 +6814,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (a->ne[3] != b->ne[3]) { return false; } + if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) || + !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) { + return false; + } + return true; } break; case GGML_OP_GET_ROWS: From 505f33274d60676320216b662a97672a76ec600e Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Mon, 11 Nov 2024 00:42:25 +0500 Subject: [PATCH 068/279] server : (web UI) Add back sampler settings (#10239) * Add back samplers to server * Added tooltips with basic information * Fixed stretching of input fields. * use component for settings input, move help msg to tooltips --------- Co-authored-by: Xuan Son Nguyen --- examples/server/public/index.html | 106 +++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 9 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index bf1d1b794..55639a944 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -200,23 +200,38 @@
System Message
- - - diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2ec13d7d2..9bca3f30e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -16,12 +16,7 @@ // auto generated files (update with ./deps.sh) #include "index.html.hpp" -#include "completion.js.hpp" #include "loading.html.hpp" -#include "deps_daisyui.min.css.hpp" -#include "deps_markdown-it.js.hpp" -#include "deps_tailwindcss.js.hpp" -#include "deps_vue.esm-browser.js.hpp" #include #include @@ -103,12 +98,6 @@ struct server_task_result { bool error; }; -struct server_static_file { - const unsigned char * data; - unsigned int size; - const char * mime_type; -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -2457,16 +2446,6 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); - // static files - std::map static_files = { - { "/", { index_html, index_html_len, "text/html; charset=utf-8" }}, - { "/completion.js", { completion_js, completion_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_daisyui.min.css", { deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8" }}, - { "/deps_markdown-it.js", { deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_tailwindcss.js", { deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }}, - }; - std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { @@ -2547,7 +2526,7 @@ int main(int argc, char ** argv) { // Middlewares // - auto middleware_validate_api_key = [¶ms, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) { + auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { static const std::unordered_set public_endpoints = { "/health", "/models", @@ -2560,7 +2539,7 @@ int main(int argc, char ** argv) { } // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) { + if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { return true; } @@ -3317,14 +3296,11 @@ int main(int argc, char ** argv) { return 1; } } else { - // using embedded static files - for (const auto & it : static_files) { - const server_static_file & static_file = it.second; - svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(static_file.data), static_file.size, static_file.mime_type); - return false; - }); - } + // using embedded static index.html + svr->Get("/", [](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + return false; + }); } // register API routes diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html new file mode 100644 index 000000000..c7e18b45e --- /dev/null +++ b/examples/server/webui/index.html @@ -0,0 +1,268 @@ + + + + + + + 🦙 llama.cpp - chat + + + +
+
+ + + +
+ +
+
+

Conversations

+ + + +
+ + +
+ + New conversation +
+
+ {{ conv.messages[0].content }} +
+
+ Conversations are saved to browser's localStorage +
+
+
+ + +
+ +
+ + + +
llama.cpp
+ + +
+ + + + + +
+
+ + +
+
+ + {{ messages.length === 0 ? 'Send a message to start' : '' }} +
+
+
+
+ + + + +
+
+ + +
+ + + + + +
+
+ + +
+
+ + +
+
+
+ + +
+ + + +
+
+ +
+ + + + + + + +
+ + + + + + + + diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json new file mode 100644 index 000000000..6b93090f0 --- /dev/null +++ b/examples/server/webui/package-lock.json @@ -0,0 +1,2783 @@ +{ + "name": "webui", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "webui", + "version": "0.0.0", + "dependencies": { + "autoprefixer": "^10.4.20", + "daisyui": "^4.12.14", + "markdown-it": "^14.1.0", + "postcss": "^8.4.49", + "tailwindcss": "^3.4.15", + "vite-plugin-singlefile": "^2.0.3", + "vue": "^3.5.13" + }, + "devDependencies": { + "vite": "^5.4.10" + } + }, + "node_modules/@alloc/quick-lru": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz", + "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==", + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.28.0.tgz", + "integrity": "sha512-wLJuPLT6grGZsy34g4N1yRfYeouklTgPhH1gWXCYspenKYD0s3cR99ZevOGw5BexMNywkbV3UkjADisozBmpPQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.28.0.tgz", + "integrity": "sha512-eiNkznlo0dLmVG/6wf+Ifi/v78G4d4QxRhuUl+s8EWZpDewgk7PX3ZyECUXU0Zq/Ca+8nU8cQpNC4Xgn2gFNDA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.28.0.tgz", + "integrity": "sha512-8hxgfReVs7k9Js1uAIhS6zq3I+wKQETInnWQtgzt8JfGx51R1N6DRVy3F4o0lQwumbErRz52YqwjfvuwRxGv1w==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.28.0.tgz", + "integrity": "sha512-lA1zZB3bFx5oxu9fYud4+g1mt+lYXCoch0M0V/xhqLoGatbzVse0wlSQ1UYOWKpuSu3gyN4qEc0Dxf/DII1bhQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.28.0.tgz", + "integrity": "sha512-aI2plavbUDjCQB/sRbeUZWX9qp12GfYkYSJOrdYTL/C5D53bsE2/nBPuoiJKoWp5SN78v2Vr8ZPnB+/VbQ2pFA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.28.0.tgz", + "integrity": "sha512-WXveUPKtfqtaNvpf0iOb0M6xC64GzUX/OowbqfiCSXTdi/jLlOmH0Ba94/OkiY2yTGTwteo4/dsHRfh5bDCZ+w==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.28.0.tgz", + "integrity": "sha512-yLc3O2NtOQR67lI79zsSc7lk31xjwcaocvdD1twL64PK1yNaIqCeWI9L5B4MFPAVGEVjH5k1oWSGuYX1Wutxpg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.28.0.tgz", + "integrity": "sha512-+P9G9hjEpHucHRXqesY+3X9hD2wh0iNnJXX/QhS/J5vTdG6VhNYMxJ2rJkQOxRUd17u5mbMLHM7yWGZdAASfcg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.28.0.tgz", + "integrity": "sha512-1xsm2rCKSTpKzi5/ypT5wfc+4bOGa/9yI/eaOLW0oMs7qpC542APWhl4A37AENGZ6St6GBMWhCCMM6tXgTIplw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.28.0.tgz", + "integrity": "sha512-zgWxMq8neVQeXL+ouSf6S7DoNeo6EPgi1eeqHXVKQxqPy1B2NvTbaOUWPn/7CfMKL7xvhV0/+fq/Z/J69g1WAQ==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.28.0.tgz", + "integrity": "sha512-VEdVYacLniRxbRJLNtzwGt5vwS0ycYshofI7cWAfj7Vg5asqj+pt+Q6x4n+AONSZW/kVm+5nklde0qs2EUwU2g==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.28.0.tgz", + "integrity": "sha512-LQlP5t2hcDJh8HV8RELD9/xlYtEzJkm/aWGsauvdO2ulfl3QYRjqrKW+mGAIWP5kdNCBheqqqYIGElSRCaXfpw==", + "cpu": [ + "s390x" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.28.0.tgz", + "integrity": "sha512-Nl4KIzteVEKE9BdAvYoTkW19pa7LR/RBrT6F1dJCV/3pbjwDcaOq+edkP0LXuJ9kflW/xOK414X78r+K84+msw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.28.0.tgz", + "integrity": "sha512-eKpJr4vBDOi4goT75MvW+0dXcNUqisK4jvibY9vDdlgLx+yekxSm55StsHbxUsRxSTt3JEQvlr3cGDkzcSP8bw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.28.0.tgz", + "integrity": "sha512-Vi+WR62xWGsE/Oj+mD0FNAPY2MEox3cfyG0zLpotZdehPFXwz6lypkGs5y38Jd/NVSbOD02aVad6q6QYF7i8Bg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.28.0.tgz", + "integrity": "sha512-kN/Vpip8emMLn/eOza+4JwqDZBL6MPNpkdaEsgUtW1NYN3DZvZqSQrbKzJcTL6hd8YNmFTn7XGWMwccOcJBL0A==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.28.0.tgz", + "integrity": "sha512-Bvno2/aZT6usSa7lRDL2+hMjVAGjuqaymF1ApZm31JXzniR/hvr14jpU+/z4X6Gt5BPlzosscyJZGUvguXIqeQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz", + "integrity": "sha512-ZOJ46sMOKUjO3e94wPdCzQ6P1Lx/vhp2RSvfaab88Ajexs0AHeV0uasYhi99WPaogmBlRHNRuly8xV75cNTMDA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-core": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/helper-string-parser": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", + "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/helper-validator-identifier": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", + "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/parser": { + "version": "7.26.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz", + "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.26.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/types": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz", + "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@vue/compiler-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz", + "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/shared": "3.5.13", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/@vue/compiler-dom/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-sfc/-/compiler-sfc-3.5.13.tgz", + "integrity": "sha512-6VdaljMpD82w6c2749Zhf5T9u5uLBWKnVue6XWxprDobftnletJ8+oel7sexFfM3qIxNmVE7LSFGTpv6obNyaQ==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/compiler-core": "3.5.13", + "@vue/compiler-dom": "3.5.13", + "@vue/compiler-ssr": "3.5.13", + "@vue/shared": "3.5.13", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.11", + "postcss": "^8.4.48", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-string-parser": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", + "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-validator-identifier": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", + "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/parser": { + "version": "7.26.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz", + "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.26.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/types": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz", + "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", + "license": "MIT" + }, + "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz", + "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/shared": "3.5.13", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-ssr": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz", + "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/@vue/compiler-sfc/node_modules/magic-string": { + "version": "0.30.14", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.14.tgz", + "integrity": "sha512-5c99P1WKTed11ZC0HMJOj6CDIue6F8ySu+bJL+85q1zBEIY8IklrJ1eiKC2NDRh3Ct3FcvmJPyQHb9erXMTJNw==", + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/runtime-dom/-/runtime-dom-3.5.13.tgz", + "integrity": "sha512-dLaj94s93NYLqjLiyFzVs9X6dWhTdAlEAciC3Moq7gzAc13VJUdCnjjRurNM6uTLFATRHexHCTu/Xp3eW6yoog==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.13", + "@vue/runtime-core": "3.5.13", + "@vue/shared": "3.5.13", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/runtime-dom/node_modules/@vue/reactivity": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/reactivity/-/reactivity-3.5.13.tgz", + "integrity": "sha512-NaCwtw8o48B9I6L1zl2p41OHo/2Z4wqYGGIK1Khu5T7yxrn+ATOixn/Udn2m+6kZKB/J7cuT9DbWWhRxqixACg==", + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/runtime-dom/node_modules/@vue/runtime-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/runtime-core/-/runtime-core-3.5.13.tgz", + "integrity": "sha512-Fj4YRQ3Az0WTZw1sFe+QDb0aXCerigEpw418pw1HBUKFtnQHWzwojaukAs2X/c9DQz4MQ4bsXTGlcpGxU/RCIw==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/runtime-dom/node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", + "license": "MIT" + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/server-renderer/-/server-renderer-3.5.13.tgz", + "integrity": "sha512-wAi4IRJV/2SAW3htkTlB+dHeRmpTiVIK1OGLWV1yeStVSebSQQOwGwIq0D3ZIoBj2C2qpgz5+vX9iEBkTdk5YA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.13", + "@vue/shared": "3.5.13" + }, + "peerDependencies": { + "vue": "3.5.13" + } + }, + "node_modules/@vue/server-renderer/node_modules/@vue/compiler-ssr": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz", + "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/shared/-/shared-3.5.13.tgz", + "integrity": "sha512-/hnE/qP5ZoGpol0a5mDi45bOd7t3tjYJBjsgCsivow7D48cJeV5l05RD82lPqi7gRiphZM37rnhW1l6ZoCNNnQ==", + "license": "MIT" + }, + "node_modules/arg": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", + "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", + "license": "MIT" + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, + "node_modules/autoprefixer": { + "version": "10.4.20", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", + "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/autoprefixer" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "browserslist": "^4.23.3", + "caniuse-lite": "^1.0.30001646", + "fraction.js": "^4.3.7", + "normalize-range": "^0.1.2", + "picocolors": "^1.0.1", + "postcss-value-parser": "^4.2.0" + }, + "bin": { + "autoprefixer": "bin/autoprefixer" + }, + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/browserslist": { + "version": "4.24.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.2.tgz", + "integrity": "sha512-ZIc+Q62revdMcqC6aChtW4jz3My3klmCO1fEmINZY/8J3EpBg5/A/D0AKmBveUh6pgoeycoMkVMko84tuYS+Gg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "caniuse-lite": "^1.0.30001669", + "electron-to-chromium": "^1.5.41", + "node-releases": "^2.0.18", + "update-browserslist-db": "^1.1.1" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/browserslist/node_modules/electron-to-chromium": { + "version": "1.5.67", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.67.tgz", + "integrity": "sha512-nz88NNBsD7kQSAGGJyp8hS6xSPtWwqNogA0mjtc2nUYeEf3nURK9qpV18TuBdDmEDgVWotS8Wkzf+V52dSQ/LQ==", + "license": "ISC" + }, + "node_modules/browserslist/node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/browserslist/node_modules/node-releases": { + "version": "2.0.18", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.18.tgz", + "integrity": "sha512-d9VeXT4SJ7ZeOqGX6R5EM022wpL+eWPooLI+5UpWn2jCT1aosUQEhQP214x33Wkwx3JQMvIm+tIoVOdodFS40g==", + "license": "MIT" + }, + "node_modules/browserslist/node_modules/update-browserslist-db": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz", + "integrity": "sha512-R8UzCaa9Az+38REPiJ1tXlImTJXlVfgHZsglwBD/k6nj76ctsH1E3q4doGrukiLQd3sGQYu56r5+lo5r94l29A==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.0" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001684", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz", + "integrity": "sha512-G1LRwLIQjBQoyq0ZJGqGIJUXzJ8irpbjHLpVRXDvBEScFJ9b17sgK6vlx0GAJFE21okD7zXl08rRRUfq6HdoEQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "license": "MIT", + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/chokidar/node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "license": "ISC", + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/chokidar/node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/chokidar/node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/chokidar/node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "license": "MIT", + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/chokidar/node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/chokidar/node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "license": "MIT", + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/chokidar/node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/css-selector-tokenizer": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", + "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==", + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "fastparse": "^1.1.2" + } + }, + "node_modules/css-selector-tokenizer/node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/css-selector-tokenizer/node_modules/fastparse": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz", + "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==", + "license": "MIT" + }, + "node_modules/culori": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/culori/-/culori-3.3.0.tgz", + "integrity": "sha512-pHJg+jbuFsCjz9iclQBqyL3B2HLCBF71BwVNujUYEvCeQMvV97R59MNK3R2+jgJ3a1fcZgI9B3vYgz8lzr/BFQ==", + "license": "MIT", + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + } + }, + "node_modules/daisyui": { + "version": "4.12.14", + "resolved": "https://registry.npmjs.org/daisyui/-/daisyui-4.12.14.tgz", + "integrity": "sha512-hA27cdBasdwd4/iEjn+aidoCrRroDuo3G5W9NDKaVCJI437Mm/3eSL/2u7MkZ0pt8a+TrYF3aT2pFVemTS3how==", + "license": "MIT", + "dependencies": { + "css-selector-tokenizer": "^0.8", + "culori": "^3", + "picocolors": "^1", + "postcss-js": "^4" + }, + "engines": { + "node": ">=16.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/daisyui" + } + }, + "node_modules/didyoumean": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz", + "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==", + "license": "Apache-2.0" + }, + "node_modules/dlv": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", + "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==", + "license": "MIT" + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/esbuild/node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/fast-glob/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fast-glob/node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/fast-glob/node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/fast-glob/node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/fraction.js": { + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", + "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "patreon", + "url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob/node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/jiti": { + "version": "1.21.6", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.6.tgz", + "integrity": "sha512-2yTgeWTWzMWkHu6Jp9NKgePDaYHbntiwvYuuJLbbN9vl7DC9DvXKOB2BC3ZZ92D3cvV/aflH0osDfwpHepQ53w==", + "license": "MIT", + "bin": { + "jiti": "bin/jiti.js" + } + }, + "node_modules/lilconfig": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", + "integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, + "node_modules/linkify-it": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/linkify-it/-/linkify-it-5.0.0.tgz", + "integrity": "sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==", + "license": "MIT", + "dependencies": { + "uc.micro": "^2.0.0" + } + }, + "node_modules/markdown-it": { + "version": "14.1.0", + "resolved": "https://registry.npmjs.org/markdown-it/-/markdown-it-14.1.0.tgz", + "integrity": "sha512-a54IwgWPaeBCAAsv13YgmALOF1elABB08FxO9i+r4VFk5Vl4pKokRPeX8u5TCgSsPi6ec1otfLjdOpVcgbpshg==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1", + "entities": "^4.4.0", + "linkify-it": "^5.0.0", + "mdurl": "^2.0.0", + "punycode.js": "^2.3.1", + "uc.micro": "^2.1.0" + }, + "bin": { + "markdown-it": "bin/markdown-it.mjs" + } + }, + "node_modules/mdurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-2.0.0.tgz", + "integrity": "sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w==", + "license": "MIT" + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "license": "MIT", + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/micromatch/node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/micromatch/node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/micromatch/node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/micromatch/node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/micromatch/node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/normalize-range": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz", + "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-hash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz", + "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-import": { + "version": "15.1.0", + "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz", + "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==", + "license": "MIT", + "dependencies": { + "postcss-value-parser": "^4.0.0", + "read-cache": "^1.0.0", + "resolve": "^1.1.7" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "postcss": "^8.0.0" + } + }, + "node_modules/postcss-import/node_modules/pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postcss-import/node_modules/read-cache": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz", + "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==", + "license": "MIT", + "dependencies": { + "pify": "^2.3.0" + } + }, + "node_modules/postcss-js": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.0.1.tgz", + "integrity": "sha512-dDLF8pEO191hJMtlHFPRa8xsizHaM82MLfNkUHdUtVEV3tgTp5oj+8qbEqYM57SLfc74KSbw//4SeJma2LRVIw==", + "license": "MIT", + "dependencies": { + "camelcase-css": "^2.0.1" + }, + "engines": { + "node": "^12 || ^14 || >= 16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + "peerDependencies": { + "postcss": "^8.4.21" + } + }, + "node_modules/postcss-js/node_modules/camelcase-css": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz", + "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/postcss-load-config": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-4.0.2.tgz", + "integrity": "sha512-bSVhyJGL00wMVoPUzAVAnbEoWyqRxkjv64tUl427SKnPrENtq6hJwUojroMz2VB+Q1edmi4IfrAPpami5VVgMQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "lilconfig": "^3.0.0", + "yaml": "^2.3.4" + }, + "engines": { + "node": ">= 14" + }, + "peerDependencies": { + "postcss": ">=8.0.9", + "ts-node": ">=9.0.0" + }, + "peerDependenciesMeta": { + "postcss": { + "optional": true + }, + "ts-node": { + "optional": true + } + } + }, + "node_modules/postcss-load-config/node_modules/lilconfig": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", + "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", + "license": "MIT", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, + "node_modules/postcss-load-config/node_modules/yaml": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz", + "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==", + "license": "ISC", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/postcss-nested": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", + "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "postcss-selector-parser": "^6.1.1" + }, + "engines": { + "node": ">=12.0" + }, + "peerDependencies": { + "postcss": "^8.2.14" + } + }, + "node_modules/postcss-selector-parser": { + "version": "6.1.2", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", + "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "util-deprecate": "^1.0.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-selector-parser/node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-selector-parser/node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "license": "MIT" + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", + "license": "MIT" + }, + "node_modules/postcss/node_modules/nanoid": { + "version": "3.3.8", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", + "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/postcss/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/punycode.js": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode.js/-/punycode.js-2.3.1.tgz", + "integrity": "sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/resolve": { + "version": "1.22.8", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", + "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==", + "license": "MIT", + "dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/resolve/node_modules/is-core-module": { + "version": "2.15.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.15.1.tgz", + "integrity": "sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ==", + "license": "MIT", + "dependencies": { + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "license": "MIT" + }, + "node_modules/resolve/node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/rollup": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.28.0.tgz", + "integrity": "sha512-G9GOrmgWHBma4YfCcX8PjH0qhXSdH8B4HDE2o4/jaxj93S4DPCIDoLcXz99eWMji4hB29UFCEd7B2gwGJDR9cQ==", + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.6" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.28.0", + "@rollup/rollup-android-arm64": "4.28.0", + "@rollup/rollup-darwin-arm64": "4.28.0", + "@rollup/rollup-darwin-x64": "4.28.0", + "@rollup/rollup-freebsd-arm64": "4.28.0", + "@rollup/rollup-freebsd-x64": "4.28.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.28.0", + "@rollup/rollup-linux-arm-musleabihf": "4.28.0", + "@rollup/rollup-linux-arm64-gnu": "4.28.0", + "@rollup/rollup-linux-arm64-musl": "4.28.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.28.0", + "@rollup/rollup-linux-riscv64-gnu": "4.28.0", + "@rollup/rollup-linux-s390x-gnu": "4.28.0", + "@rollup/rollup-linux-x64-gnu": "4.28.0", + "@rollup/rollup-linux-x64-musl": "4.28.0", + "@rollup/rollup-win32-arm64-msvc": "4.28.0", + "@rollup/rollup-win32-ia32-msvc": "4.28.0", + "@rollup/rollup-win32-x64-msvc": "4.28.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/rollup/node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.28.0.tgz", + "integrity": "sha512-lmKx9yHsppblnLQZOGxdO66gT77bvdBtr/0P+TPOseowE7D9AJoBw8ZDULRasXRWf1Z86/gcOdpBrV6VDUY36Q==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/rollup/node_modules/@types/estree": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", + "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", + "license": "MIT" + }, + "node_modules/sucrase": { + "version": "3.35.0", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", + "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==", + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "glob": "^10.3.10", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/@isaacs/cliui": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", + "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", + "license": "ISC", + "dependencies": { + "string-width": "^5.1.2", + "string-width-cjs": "npm:string-width@^4.2.0", + "strip-ansi": "^7.0.1", + "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", + "wrap-ansi": "^8.1.0", + "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "license": "MIT", + "dependencies": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/sucrase/node_modules/@pkgjs/parseargs": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", + "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/sucrase/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/sucrase/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/sucrase/node_modules/cross-spawn": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", + "license": "MIT", + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/sucrase/node_modules/eastasianwidth": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/emoji-regex": { + "version": "9.2.2", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", + "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/foreground-child": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", + "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", + "license": "ISC", + "dependencies": { + "cross-spawn": "^7.0.0", + "signal-exit": "^4.0.1" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/glob": { + "version": "10.4.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", + "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "license": "ISC", + "dependencies": { + "foreground-child": "^3.1.0", + "jackspeak": "^3.1.2", + "minimatch": "^9.0.4", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^1.11.1" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", + "license": "ISC" + }, + "node_modules/sucrase/node_modules/jackspeak": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", + "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", + "license": "BlueOak-1.0.0", + "dependencies": { + "@isaacs/cliui": "^8.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + }, + "optionalDependencies": { + "@pkgjs/parseargs": "^0.11.0" + } + }, + "node_modules/sucrase/node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "license": "ISC" + }, + "node_modules/sucrase/node_modules/minimatch": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", + "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "license": "ISC", + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sucrase/node_modules/package-json-from-dist": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", + "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", + "license": "BlueOak-1.0.0" + }, + "node_modules/sucrase/node_modules/path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "license": "BlueOak-1.0.0", + "dependencies": { + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/pirates": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz", + "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/sucrase/node_modules/shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "license": "MIT", + "dependencies": { + "shebang-regex": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "license": "ISC", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/string-width": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", + "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", + "license": "MIT", + "dependencies": { + "eastasianwidth": "^0.2.0", + "emoji-regex": "^9.2.2", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs": { + "name": "string-width", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/strip-ansi-cjs": { + "name": "strip-ansi", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/strip-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/thenify": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "license": "MIT", + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/sucrase/node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "license": "Apache-2.0" + }, + "node_modules/sucrase/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "license": "ISC", + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", + "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.1.0", + "string-width": "^5.0.1", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs": { + "name": "wrap-ansi", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "license": "MIT", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/tailwindcss": { + "version": "3.4.15", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz", + "integrity": "sha512-r4MeXnfBmSOuKUWmXe6h2CcyfzJCEk4F0pptO5jlnYSIViUkVmsawj80N5h2lO3gwcmSb4n3PuN+e+GC1Guylw==", + "license": "MIT", + "dependencies": { + "@alloc/quick-lru": "^5.2.0", + "arg": "^5.0.2", + "chokidar": "^3.6.0", + "didyoumean": "^1.2.2", + "dlv": "^1.1.3", + "fast-glob": "^3.3.2", + "glob-parent": "^6.0.2", + "is-glob": "^4.0.3", + "jiti": "^1.21.6", + "lilconfig": "^2.1.0", + "micromatch": "^4.0.8", + "normalize-path": "^3.0.0", + "object-hash": "^3.0.0", + "picocolors": "^1.1.1", + "postcss": "^8.4.47", + "postcss-import": "^15.1.0", + "postcss-js": "^4.0.1", + "postcss-load-config": "^4.0.2", + "postcss-nested": "^6.2.0", + "postcss-selector-parser": "^6.1.2", + "resolve": "^1.22.8", + "sucrase": "^3.35.0" + }, + "bin": { + "tailwind": "lib/cli.js", + "tailwindcss": "lib/cli.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/uc.micro": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz", + "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==", + "license": "MIT" + }, + "node_modules/vite": { + "version": "5.4.11", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz", + "integrity": "sha512-c7jFQRklXua0mTzneGW9QVyxFjUgwcihC4bXEtujIo2ouWCe1Ajt/amn2PCxYnhYfd5k09JX3SB7OYWFKYqj8Q==", + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/vite-plugin-singlefile": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/vite-plugin-singlefile/-/vite-plugin-singlefile-2.0.3.tgz", + "integrity": "sha512-OEBEwdX8nCGPSdtaB1D7rryYnT+YfPTS8ojL1TDyeUF+bWDCTfRriQqw6T0vl9EbKI/KMg7szN3awst6cLrKkA==", + "license": "MIT", + "dependencies": { + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">18.0.0" + }, + "peerDependencies": { + "rollup": "^4.24.3", + "vite": "^5.4.10" + } + }, + "node_modules/vue": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.13.tgz", + "integrity": "sha512-wmeiSMxkZCSc+PM2w2VRsOYAZC8GdipNFRTsLSfodVqI9mbejKeXEGr8SckuLnrQPGe3oJN5c3K0vpoU9q/wCQ==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/compiler-sfc": "3.5.13", + "@vue/runtime-dom": "3.5.13", + "@vue/server-renderer": "3.5.13", + "@vue/shared": "3.5.13" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + } + } +} diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json new file mode 100644 index 000000000..2a45ece14 --- /dev/null +++ b/examples/server/webui/package.json @@ -0,0 +1,23 @@ +{ + "name": "webui", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "devDependencies": { + "vite": "^5.4.10" + }, + "dependencies": { + "autoprefixer": "^10.4.20", + "daisyui": "^4.12.14", + "markdown-it": "^14.1.0", + "postcss": "^8.4.49", + "tailwindcss": "^3.4.15", + "vite-plugin-singlefile": "^2.0.3", + "vue": "^3.5.13" + } +} diff --git a/examples/server/webui/postcss.config.js b/examples/server/webui/postcss.config.js new file mode 100644 index 000000000..2e7af2b7f --- /dev/null +++ b/examples/server/webui/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/examples/server/public/completion.js b/examples/server/webui/src/completion.js similarity index 100% rename from examples/server/public/completion.js rename to examples/server/webui/src/completion.js diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js new file mode 100644 index 000000000..9b5b12329 --- /dev/null +++ b/examples/server/webui/src/main.js @@ -0,0 +1,456 @@ +import './styles.css'; +import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js'; +import { llama } from './completion.js'; +import MarkdownIt from 'markdown-it'; + +// utility functions +const isString = (x) => !!x.toLowerCase; +const isNumeric = (n) => !isString(n) && !isNaN(n); +const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"'); +const copyStr = (str) => navigator.clipboard.writeText(str); + +// constants +const BASE_URL = localStorage.getItem('base') // for debugging + || (new URL('.', document.baseURI).href).toString(); // for production +const CONFIG_DEFAULT = { + // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. + apiKey: '', + systemMessage: 'You are a helpful assistant.', + // make sure these default values are in sync with `common.h` + samplers: 'dkypmxt', + temperature: 0.8, + dynatemp_range: 0.0, + dynatemp_exponent: 1.0, + top_k: 40, + top_p: 0.95, + min_p: 0.05, + xtc_probability: 0.0, + xtc_threshold: 0.1, + typical_p: 1.0, + repeat_last_n: 64, + repeat_penalty: 1.0, + presence_penalty: 0.0, + frequency_penalty: 0.0, + dry_multiplier: 0.0, + dry_base: 1.75, + dry_allowed_length: 2, + dry_penalty_last_n: -1, + max_tokens: -1, + custom: '', // custom json-stringified object +}; +const CONFIG_INFO = { + apiKey: 'Set the API Key if you are using --api-key option for the server.', + systemMessage: 'The starting message that defines how model should behave.', + samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature', + temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.', + dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.', + dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.', + top_k: 'Keeps only k top tokens.', + top_p: 'Limits tokens to those that together have a cumulative probability of at least p', + min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.', + xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.', + xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.', + typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.', + repeat_last_n: 'Last n tokens to consider for penalizing repetition', + repeat_penalty: 'Controls the repetition of token sequences in the generated text', + presence_penalty: 'Limits tokens based on whether they appear in the output or not.', + frequency_penalty: 'Limits tokens based on how often they appear in the output.', + dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.', + dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.', + dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.', + dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.', + max_tokens: 'The maximum number of token per output.', + custom: '', // custom json-stringified object +}; +// config keys having numeric value (i.e. temperature, top_k, top_p, etc) +const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]); +// list of themes supported by daisyui +const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset']; + +// markdown support +const VueMarkdown = defineComponent( + (props) => { + const md = shallowRef(new MarkdownIt({ breaks: true })); + const origFenchRenderer = md.value.renderer.rules.fence; + md.value.renderer.rules.fence = (tokens, idx, ...args) => { + const content = tokens[idx].content; + const origRendered = origFenchRenderer(tokens, idx, ...args); + return `
+ + ${origRendered} +
`; + }; + window.copyStr = copyStr; + const content = computed(() => md.value.render(props.source)); + return () => h("div", { innerHTML: content.value }); + }, + { props: ["source"] } +); + +// input field to be used by settings modal +const SettingsModalShortInput = defineComponent({ + template: document.getElementById('settings-modal-short-input').innerHTML, + props: { + label: { type: String, required: false }, + configKey: String, + configDefault: Object, + configInfo: Object, + modelValue: [Object, String, Number], + }, +}); + +// coversations is stored in localStorage +// format: { [convId]: { id: string, lastModified: number, messages: [...] } } +// convId is a string prefixed with 'conv-' +const StorageUtils = { + // manage conversations + getAllConversations() { + const res = []; + for (const key in localStorage) { + if (key.startsWith('conv-')) { + res.push(JSON.parse(localStorage.getItem(key))); + } + } + res.sort((a, b) => b.lastModified - a.lastModified); + return res; + }, + // can return null if convId does not exist + getOneConversation(convId) { + return JSON.parse(localStorage.getItem(convId) || 'null'); + }, + // if convId does not exist, create one + appendMsg(convId, msg) { + if (msg.content === null) return; + const conv = StorageUtils.getOneConversation(convId) || { + id: convId, + lastModified: Date.now(), + messages: [], + }; + conv.messages.push(msg); + conv.lastModified = Date.now(); + localStorage.setItem(convId, JSON.stringify(conv)); + }, + getNewConvId() { + return `conv-${Date.now()}`; + }, + remove(convId) { + localStorage.removeItem(convId); + }, + filterAndKeepMsgs(convId, predicate) { + const conv = StorageUtils.getOneConversation(convId); + if (!conv) return; + conv.messages = conv.messages.filter(predicate); + conv.lastModified = Date.now(); + localStorage.setItem(convId, JSON.stringify(conv)); + }, + popMsg(convId) { + const conv = StorageUtils.getOneConversation(convId); + if (!conv) return; + const msg = conv.messages.pop(); + conv.lastModified = Date.now(); + if (conv.messages.length === 0) { + StorageUtils.remove(convId); + } else { + localStorage.setItem(convId, JSON.stringify(conv)); + } + return msg; + }, + + // manage config + getConfig() { + const savedVal = JSON.parse(localStorage.getItem('config') || '{}'); + // to prevent breaking changes in the future, we always provide default value for missing keys + return { + ...CONFIG_DEFAULT, + ...savedVal, + }; + }, + setConfig(config) { + localStorage.setItem('config', JSON.stringify(config)); + }, + getTheme() { + return localStorage.getItem('theme') || 'auto'; + }, + setTheme(theme) { + if (theme === 'auto') { + localStorage.removeItem('theme'); + } else { + localStorage.setItem('theme', theme); + } + }, +}; + +// scroll to bottom of chat messages +// if requiresNearBottom is true, only auto-scroll if user is near bottom +const chatScrollToBottom = (requiresNearBottom) => { + const msgListElem = document.getElementById('messages-list'); + const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight; + if (!requiresNearBottom || (spaceToBottom < 100)) { + setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1); + } +}; + +const mainApp = createApp({ + components: { + VueMarkdown, + SettingsModalShortInput, + }, + data() { + return { + conversations: StorageUtils.getAllConversations(), + messages: [], // { id: number, role: 'user' | 'assistant', content: string } + viewingConvId: StorageUtils.getNewConvId(), + inputMsg: '', + isGenerating: false, + pendingMsg: null, // the on-going message from assistant + stopGeneration: () => {}, + selectedTheme: StorageUtils.getTheme(), + config: StorageUtils.getConfig(), + showConfigDialog: false, + editingMsg: null, + // const + themes: THEMES, + configDefault: {...CONFIG_DEFAULT}, + configInfo: {...CONFIG_INFO}, + } + }, + computed: {}, + mounted() { + document.getElementById('app').classList.remove('opacity-0'); // show app + // scroll to the bottom when the pending message height is updated + const pendingMsgElem = document.getElementById('pending-msg'); + const resizeObserver = new ResizeObserver(() => { + if (this.isGenerating) chatScrollToBottom(true); + }); + resizeObserver.observe(pendingMsgElem); + }, + methods: { + hideSidebar() { + document.getElementById('toggle-drawer').checked = false; + }, + setSelectedTheme(theme) { + this.selectedTheme = theme; + StorageUtils.setTheme(theme); + }, + newConversation() { + if (this.isGenerating) return; + this.viewingConvId = StorageUtils.getNewConvId(); + this.editingMsg = null; + this.fetchMessages(); + chatScrollToBottom(); + this.hideSidebar(); + }, + setViewingConv(convId) { + if (this.isGenerating) return; + this.viewingConvId = convId; + this.editingMsg = null; + this.fetchMessages(); + chatScrollToBottom(); + this.hideSidebar(); + }, + deleteConv(convId) { + if (this.isGenerating) return; + if (window.confirm('Are you sure to delete this conversation?')) { + StorageUtils.remove(convId); + if (this.viewingConvId === convId) { + this.viewingConvId = StorageUtils.getNewConvId(); + this.editingMsg = null; + } + this.fetchConversation(); + this.fetchMessages(); + } + }, + downloadConv(convId) { + const conversation = StorageUtils.getOneConversation(convId); + if (!conversation) { + alert('Conversation not found.'); + return; + } + const conversationJson = JSON.stringify(conversation, null, 2); + const blob = new Blob([conversationJson], { type: 'application/json' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `conversation_${convId}.json`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + }, + async sendMessage() { + if (!this.inputMsg) return; + const currConvId = this.viewingConvId; + + StorageUtils.appendMsg(currConvId, { + id: Date.now(), + role: 'user', + content: this.inputMsg, + }); + this.fetchConversation(); + this.fetchMessages(); + this.inputMsg = ''; + this.editingMsg = null; + this.generateMessage(currConvId); + chatScrollToBottom(); + }, + async generateMessage(currConvId) { + if (this.isGenerating) return; + this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null }; + this.isGenerating = true; + this.editingMsg = null; + + try { + const abortController = new AbortController(); + this.stopGeneration = () => abortController.abort(); + const params = { + messages: [ + { role: 'system', content: this.config.systemMessage }, + ...this.messages, + ], + stream: true, + cache_prompt: true, + samplers: this.config.samplers, + temperature: this.config.temperature, + dynatemp_range: this.config.dynatemp_range, + dynatemp_exponent: this.config.dynatemp_exponent, + top_k: this.config.top_k, + top_p: this.config.top_p, + min_p: this.config.min_p, + typical_p: this.config.typical_p, + xtc_probability: this.config.xtc_probability, + xtc_threshold: this.config.xtc_threshold, + repeat_last_n: this.config.repeat_last_n, + repeat_penalty: this.config.repeat_penalty, + presence_penalty: this.config.presence_penalty, + frequency_penalty: this.config.frequency_penalty, + dry_multiplier: this.config.dry_multiplier, + dry_base: this.config.dry_base, + dry_allowed_length: this.config.dry_allowed_length, + dry_penalty_last_n: this.config.dry_penalty_last_n, + max_tokens: this.config.max_tokens, + ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}), + ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}), + }; + const config = { + controller: abortController, + api_url: BASE_URL, + endpoint: '/chat/completions', + }; + for await (const chunk of llama(prompt, params, config)) { + const stop = chunk.data.stop; + const addedContent = chunk.data.choices[0].delta.content; + const lastContent = this.pendingMsg.content || ''; + if (addedContent) { + this.pendingMsg = { + id: this.pendingMsg.id, + role: 'assistant', + content: lastContent + addedContent, + }; + } + } + + StorageUtils.appendMsg(currConvId, this.pendingMsg); + this.fetchConversation(); + this.fetchMessages(); + setTimeout(() => document.getElementById('msg-input').focus(), 1); + } catch (error) { + if (error.name === 'AbortError') { + // user stopped the generation via stopGeneration() function + StorageUtils.appendMsg(currConvId, this.pendingMsg); + this.fetchConversation(); + this.fetchMessages(); + } else { + console.error(error); + alert(error); + // pop last user message + const lastUserMsg = StorageUtils.popMsg(currConvId); + this.inputMsg = lastUserMsg ? lastUserMsg.content : ''; + } + } + + this.pendingMsg = null; + this.isGenerating = false; + this.stopGeneration = () => {}; + this.fetchMessages(); + chatScrollToBottom(); + }, + + // message actions + regenerateMsg(msg) { + if (this.isGenerating) return; + // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds + const currConvId = this.viewingConvId; + StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); + this.fetchConversation(); + this.fetchMessages(); + this.generateMessage(currConvId); + }, + copyMsg(msg) { + copyStr(msg.content); + }, + editUserMsgAndRegenerate(msg) { + if (this.isGenerating) return; + const currConvId = this.viewingConvId; + const newContent = msg.content; + this.editingMsg = null; + StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); + StorageUtils.appendMsg(currConvId, { + id: Date.now(), + role: 'user', + content: newContent, + }); + this.fetchConversation(); + this.fetchMessages(); + this.generateMessage(currConvId); + }, + + // settings dialog methods + closeAndSaveConfigDialog() { + try { + if (this.config.custom.length) JSON.parse(this.config.custom); + } catch (error) { + alert('Invalid JSON for custom config. Please either fix it or leave it empty.'); + return; + } + for (const key of CONFIG_NUMERIC_KEYS) { + if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) { + alert(`Invalid number for ${key} (expected an integer or a float)`); + return; + } + this.config[key] = parseFloat(this.config[key]); + } + this.showConfigDialog = false; + StorageUtils.setConfig(this.config); + }, + closeAndDiscardConfigDialog() { + this.showConfigDialog = false; + this.config = StorageUtils.getConfig(); + }, + resetConfigDialog() { + if (window.confirm('Are you sure to reset all settings?')) { + this.config = {...CONFIG_DEFAULT}; + } + }, + + // sync state functions + fetchConversation() { + this.conversations = StorageUtils.getAllConversations(); + }, + fetchMessages() { + this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? []; + }, + }, +}); +mainApp.config.errorHandler = alert; +try { + mainApp.mount('#app'); +} catch (err) { + console.error(err); + document.getElementById('app').innerHTML = `
+ Failed to start app. Please try clearing localStorage and try again.
+
+ +
`; +} diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css new file mode 100644 index 000000000..67d35b99e --- /dev/null +++ b/examples/server/webui/src/styles.css @@ -0,0 +1,26 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +.markdown { + h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } + pre { + @apply whitespace-pre-wrap rounded-lg p-2; + border: 1px solid currentColor; + } + /* TODO: fix markdown table */ +} + +.show-on-hover { + @apply md:opacity-0 md:group-hover:opacity-100; +} +.btn-mini { + @apply cursor-pointer hover:shadow-md; +} +.chat-screen { max-width: 900px; } + +.chat-bubble-base-300 { + --tw-bg-opacity: 1; + --tw-text-opacity: 1; + @apply bg-base-300 text-base-content; +} diff --git a/examples/server/webui/tailwind.config.js b/examples/server/webui/tailwind.config.js new file mode 100644 index 000000000..c43066a19 --- /dev/null +++ b/examples/server/webui/tailwind.config.js @@ -0,0 +1,16 @@ +/** @type {import('tailwindcss').Config} */ +export default { + content: [ + "./index.html", + "./src/**/*.{js,ts,jsx,tsx}", + ], + theme: { + extend: {}, + }, + plugins: [ + require('daisyui'), + ], + daisyui: { + themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'], + } +} diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js new file mode 100644 index 000000000..789bf9cbb --- /dev/null +++ b/examples/server/webui/vite.config.js @@ -0,0 +1,36 @@ + +import { viteSingleFile } from 'vite-plugin-singlefile'; +import path from 'path'; +import fs from 'fs'; + +const GUIDE_FOR_FRONTEND = ` + +`.trim(); + +export default { + plugins: [ + viteSingleFile(), + (function llamaCppPlugin() { + let config; + return { + name: 'llamacpp:build', + apply: 'build', + async configResolved(_config) { + config = _config; + }, + writeBundle() { + const outputIndexHtml = path.join(config.build.outDir, 'index.html'); + const content = fs.readFileSync(outputIndexHtml, 'utf-8'); + + const targetOutputFile = path.join(config.build.outDir, '../../public/index.html'); + fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content); + } + } + })(), + ], +}; From cc98896db858df7aa40d0e16a505883ef196a482 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 3 Dec 2024 13:29:54 -0600 Subject: [PATCH 260/279] vulkan: optimize and reenable split_k (#10637) Use vector loads when possible in mul_mat_split_k_reduce. Use split_k when there aren't enough workgroups to fill the shaders. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 51 +++++++++++++++---- .../mul_mat_split_k_reduce.comp | 31 ++++++++--- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index df6a659f4..17e1be105 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -165,6 +165,7 @@ struct vk_device_struct { vk_queue transfer_queue; bool single_queue; uint32_t subgroup_size; + uint32_t shader_core_count; bool uma; size_t idx; @@ -1498,7 +1499,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); @@ -1610,11 +1611,14 @@ static vk_device ggml_vk_get_device(size_t idx) { const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; + bool sm_builtins = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; + } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { + sm_builtins = true; } } @@ -1622,11 +1626,21 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceMaintenance4Properties props4; vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; props2.pNext = &props3; props3.pNext = &subgroup_props; + + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props; + if (maintenance4_support) { - subgroup_props.pNext = &props4; + last_struct->pNext = (VkBaseOutStructure *)&props4; + last_struct = (VkBaseOutStructure *)&props4; } + if (sm_builtins) { + last_struct->pNext = (VkBaseOutStructure *)&sm_props; + last_struct = (VkBaseOutStructure *)&sm_props; + } + device->physical_device.getProperties2(&props2); device->properties = props2.properties; @@ -1643,6 +1657,11 @@ static vk_device ggml_vk_get_device(size_t idx) { device->vendor_id = device->properties.vendorID; device->subgroup_size = subgroup_props.subgroupSize; device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + if (sm_builtins) { + device->shader_core_count = sm_props.shaderSMCount; + } else { + device->shader_core_count = 0; + } bool fp16_storage = false; bool fp16_compute = false; @@ -2732,15 +2751,25 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz dst->device->device.resetFences({ dst->device->fence }); } -static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { +static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")"); - // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { - // return 4; - // } - return 1; + uint32_t split_k = 1; + if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) { + // If k is 'large' and the SMs will fill less than halfway, use split_k. + uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]); + uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]); + if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) { + split_k = ctx->device->shader_core_count / (m_tiles * n_tiles); + // Clamp to 2 or 4 + split_k = std::min(split_k, 4u); + if (split_k == 3) { + split_k = 2; + } + } + } - GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k); + return split_k; } static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { @@ -2964,10 +2993,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); + const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; @@ -2993,7 +3022,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub if (dryrun) { const uint64_t x_sz_upd = x_sz * ne02 * ne03; const uint64_t y_sz_upd = y_sz * ne12 * ne13; - const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0; + const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0; if ( (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) || diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp index 825b91031..4c64fd47a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp @@ -5,7 +5,9 @@ layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {float data_a[];}; +layout (binding = 0) readonly buffer A4 {vec4 data_a4[];}; layout (binding = 1) writeonly buffer D {float data_d[];}; +layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];}; layout (push_constant) uniform parameter { uint ne; @@ -13,17 +15,34 @@ layout (push_constant) uniform parameter { } p; void main() { - const uint idx = gl_GlobalInvocationID.x; + // Each invocation handles four consecutive components + const uint idx = gl_GlobalInvocationID.x * 4; if (idx >= p.ne) { return; } - float result = 0.0f; + // Check if all four components are in bounds and aligned, + // then use vector loads + if (idx + 3 < p.ne && (p.ne % 4) == 0) { + vec4 result = vec4(0.0f); - [[unroll]] for (uint i = 0; i < p.k_num; i++) { - result += data_a[i * p.ne + idx]; + [[unroll]] for (uint i = 0; i < p.k_num; i++) { + result += data_a4[(i * p.ne + idx) / 4]; + } + + data_d4[idx / 4] = result; + } else { + [[unroll]] for (uint j = 0; j < 4; ++j) { + if (idx + j < p.ne) { + float result = 0.0f; + + [[unroll]] for (uint i = 0; i < p.k_num; i++) { + result += data_a[i * p.ne + idx + j]; + } + + data_d[idx + j] = result; + } + } } - - data_d[idx] = result; } From 01e6d9bb71eb71fe1f811f2fdef15753232cd0f2 Mon Sep 17 00:00:00 2001 From: piDack <104877312+piDack@users.noreply.github.com> Date: Wed, 4 Dec 2024 08:26:37 +0800 Subject: [PATCH 261/279] clip : add sycl support (#10574) Co-authored-by: piDack --- examples/llava/clip.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7ba4cea58..d7c94352b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -12,6 +12,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif @@ -1169,6 +1173,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif +#ifdef GGML_USE_SYCL + new_clip->backend = ggml_backend_sycl_init(0); + LOG_INF("%s: CLIP using SYCL backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); LOG_INF("%s: CLIP using CPU backend\n", __func__); From da6aac91f150a3b0bcc26d3fd50288accb15f179 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Tue, 3 Dec 2024 16:40:36 -0800 Subject: [PATCH 262/279] Add docs for creating a static build (#10268) (#10630) * Add notes for a static build * Update docs/build.md --------- Co-authored-by: Diego Devesa --- docs/build.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/build.md b/docs/build.md index 97e340ab6..a4964cbd1 100644 --- a/docs/build.md +++ b/docs/build.md @@ -39,6 +39,11 @@ cmake --build build --config Release ``` For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). +- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: + ``` + cmake -B build -DBUILD_SHARED_LIBS=OFF + cmake --build build --config Release + ``` - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): From cd2f37b304f8e88b9de8424b31078b97f9cf7c60 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Wed, 4 Dec 2024 02:41:37 +0200 Subject: [PATCH 263/279] Avoid using __fp16 on ARM with old nvcc (#10616) --- ggml/src/ggml-impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 78e3af8f2..00a1546a7 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -310,14 +310,14 @@ void ggml_aligned_free(void * ptr, size_t size); // FP16 to FP32 conversion #if defined(__ARM_NEON) - #ifdef _MSC_VER + #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) typedef uint16_t ggml_fp16_internal_t; #else typedef __fp16 ggml_fp16_internal_t; #endif #endif -#if defined(__ARM_NEON) && !defined(_MSC_VER) +#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) From 98036d5670f21e9b9a99d5e3dbb3bf7589f5c4e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Wed, 4 Dec 2024 09:22:50 +0800 Subject: [PATCH 264/279] fix typo of README.md (#10605) --- grammars/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grammars/README.md b/grammars/README.md index 4e57bca5f..976954091 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -46,7 +46,7 @@ Terminals support the full range of Unicode. Unicode characters can be specified Character ranges can be negated with `^`: ``` -single-line ::= [^\n]+ "\n"` +single-line ::= [^\n]+ "\n" ``` ## Sequences and Alternatives From 40c6d79fb52f995f47507fedfeaae2ac05d9b35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Wed, 4 Dec 2024 02:29:20 +0100 Subject: [PATCH 265/279] SYCL : Move to compile time oneMKL interface backend selection for NVIDIA backend (#10584) * [SYCL] Move to Compile Time backend selection on oneMKL Interface for NVIDIA backend Move to compile time selection to backend to avoid latency at run time. Add it to all mkl gemm calls and only for NVIDIA backend. Signed-off-by: nscipione * Formatting * Address PR comments to increase readibility --------- Signed-off-by: nscipione --- ggml/src/ggml-sycl/CMakeLists.txt | 3 ++- ggml/src/ggml-sycl/dpct/helper.hpp | 43 +++++++++++++++++++++--------- ggml/src/ggml-sycl/ggml-sycl.cpp | 13 ++++++--- ggml/src/ggml-sycl/outprod.cpp | 16 +++++------ 4 files changed, 50 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 83f223fd7..3579a311a 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -68,7 +68,8 @@ else() target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) + add_compile_definitions(GGML_SYCL_NVIDIA) + target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index c2f28bb49..d1b5dd87c 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1689,9 +1689,14 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::mkl::blas::column_major::gemm( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - data_b, ldb, beta_value, data_c, ldc); +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ q }, + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); +#else + oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); +#endif } template @@ -1754,14 +1759,22 @@ namespace dpct matrix_info->ld_info[2] = ldc; matrix_info->groupsize_info = batch_size; +#ifdef GGML_SYCL_NVIDIA sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( - q, matrix_info->transpose_info, matrix_info->transpose_info + 1, - matrix_info->size_info, matrix_info->size_info + 1, - matrix_info->size_info + 2, matrix_info->value_info, - reinterpret_cast(a), matrix_info->ld_info, - reinterpret_cast(b), matrix_info->ld_info + 1, - matrix_info->value_info + 1, reinterpret_cast(c), + oneapi::mkl::backend_selector{ q }, matrix_info->transpose_info, + matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, + matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast(a), + matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, + matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, + &(matrix_info->groupsize_info)); +#else + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, + matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, + reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), + matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); +#endif q.submit([&](sycl::handler &cgh) { @@ -1783,10 +1796,16 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); +#ifdef GGML_SYCL_NVIDIA oneapi::mkl::blas::column_major::gemm_batch( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - stride_a, data_b, ldb, stride_b, beta_value, - data_c, ldc, stride_c, batch_size); + oneapi::mkl::backend_selector{ q }, a_trans, b_trans, m, n, k, + alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, + batch_size); +#else + oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, + stride_c, batch_size); +#endif } } // namespace detail diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1310981e5..135efb521 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2573,12 +2573,17 @@ inline void ggml_sycl_op_mul_mat_sycl( const float alpha = 1.0f; const float beta = 0.0f; #if !GGML_SYCL_DNNL +# ifdef GGML_SYCL_NVIDIA SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - *stream, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, - dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, - src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), + oneapi::mkl::backend_selector{ *stream }, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, + ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); +# else + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); +# endif #else auto dnnl_stream = ctx.stream_dnnl(stream); DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt(), diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index e61cdc2ca..ef9af0b76 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -40,14 +40,14 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr try { // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::column_major::gemm(*stream, - oneapi::mkl::transpose::nontrans, src1_op, - ne0, ne1, ne01, - alpha, - src0_d, ne00, - src1_d, ldb, - beta, - dst_d, ne0); +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ *stream }, + oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, + ne00, src1_d, ldb, beta, dst_d, ne0); +#else + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, + src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); +#endif } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; From 2759916d86b70e7aceaed4d0b4e7ed126f0f9e51 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 4 Dec 2024 01:28:59 -0600 Subject: [PATCH 266/279] vulkan: Implement "fast divide" (mul+shift) for unary ops like copy (#10642) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 45 ++++++++++++++++++- .../vulkan-shaders/generic_unary_head.comp | 27 ++++++++--- tests/test-backend-ops.cpp | 2 + 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 17e1be105..07b45d6b9 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -353,7 +353,45 @@ struct vk_op_unary_push_constants { uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t d_offset; float param1; float param2; + uint32_t ne0_012mp; uint32_t ne0_012L; + uint32_t ne0_01mp; uint32_t ne0_01L; + uint32_t ne0_0mp; uint32_t ne0_0L; + uint32_t ne1_012mp; uint32_t ne1_012L; + uint32_t ne1_01mp; uint32_t ne1_01L; + uint32_t ne1_0mp; uint32_t ne1_0L; }; +static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); + +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +{ + // compute L = ceil(log2(d)); + L = 0; + while (L < 32 && (uint32_t{1} << L) < d) { + L++; + } + + mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); +} + +template void init_pushconst_fastdiv(T &p) { + static_assert(!std::is_const::value, "unexpected type"); +} + +template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { + // Compute magic values to divide by these six numbers. + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); +} struct vk_op_binary_push_constants { uint32_t ne; @@ -2914,13 +2952,14 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - const vk_op_unary_push_constants pc = { + vk_op_unary_push_constants pc = { (uint32_t)ne, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, }; + init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); } @@ -4125,7 +4164,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4165,6 +4204,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint64_t ned3 = dst->ne[3]; const uint64_t ned = ned0 * ned1; + init_pushconst_fastdiv(pc); + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); if (pipeline == nullptr) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp index 4e1fa3af3..ab7c9d7eb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp @@ -8,6 +8,13 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint d_offset; float param1; float param2; + + uint ne0_012mp; uint ne0_012L; + uint ne0_01mp; uint ne0_01L; + uint ne0_0mp; uint ne0_0L; + uint ne1_012mp; uint ne1_012L; + uint ne1_01mp; uint ne1_01L; + uint ne1_0mp; uint ne1_0L; } p; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -17,22 +24,30 @@ uint get_idx() { return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; } +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + // msbs = mulhi(n, mp) + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} + uint src0_idx(uint idx) { - const uint i03 = idx / (p.ne02*p.ne01*p.ne00); + const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = (idx - i03_offset - i02_offset) / p.ne00; + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = idx / (p.ne12*p.ne11*p.ne10); + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 87c92dadd..807d271c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3862,6 +3862,8 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f)); From 8d0cfd554a9ae545ff94d27e04458f537b4e8c0e Mon Sep 17 00:00:00 2001 From: JFLFY2255 Date: Wed, 4 Dec 2024 17:42:50 +0800 Subject: [PATCH 267/279] llama: Support MiniCPM-1B (with & w/o longrope) (#10559) --- convert_hf_to_gguf.py | 57 ++++++++----- gguf-py/gguf/constants.py | 9 +- include/llama.h | 3 +- src/llama.cpp | 175 +++++--------------------------------- 4 files changed, 61 insertions(+), 183 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b931049d1..d8df5cc00 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1831,29 +1831,40 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_llama_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1863,9 +1874,9 @@ class MiniCPMModel(Model): # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7df23371c..703199fcb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -896,6 +896,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1388,9 +1390,10 @@ class TokenType(IntEnum): class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + LONGROPE = 'longrope' class PoolingType(IntEnum): diff --git a/include/llama.h b/include/llama.h index e85f459fc..168c3fa1f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -185,7 +185,8 @@ extern "C" { LLAMA_ROPE_SCALING_TYPE_NONE = 0, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_YARN = 2, - LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, + LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, }; enum llama_pooling_type { diff --git a/src/llama.cpp b/src/llama.cpp index 6a6f4c2a5..00f78639e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1036,6 +1036,8 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, @@ -1683,9 +1685,10 @@ struct LLM_TN { // static const std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, - { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, }; static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { @@ -5580,8 +5583,12 @@ static void llm_load_hparams( case LLM_ARCH_MINICPM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); switch (hparams.n_layer) { + case 52: model.type = e_model::MODEL_1B; break; case 40: model.type = e_model::MODEL_2B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -7065,7 +7072,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { + if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); @@ -7690,7 +7697,13 @@ static bool llm_load_tensors( layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } if (n_expert == 0) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); @@ -13497,153 +13510,6 @@ struct llm_build_context { return gf; } - // ref: https://arxiv.org/abs/2203.03466 - // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 - // based on the original build_llama() function - struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const int64_t n_embd = hparams.n_embd; - //TODO: if the model varies, these parameters need to be read from the model - const int64_t n_embd_base = 256; - const float scale_embd = 12.0f; - const float scale_depth = 1.4f; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - - // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // scale_res - scale the hidden states for residual connection - const float scale_res = scale_depth/sqrtf(float(n_layer)); - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled", -1); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // scale the hidden states for residual connection - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled_ffn", -1); - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head scaling - const float scale_lmhead = float(n_embd_base)/float(n_embd); - cur = ggml_scale(ctx0, cur, scale_lmhead); - cb(cur, "lmhead_scaling", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - struct ggml_cgraph * build_minicpm3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -16742,6 +16608,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { @@ -16825,10 +16692,6 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_internlm2(); } break; - case LLM_ARCH_MINICPM: - { - result = llm.build_minicpm(); - } break; case LLM_ARCH_MINICPM3: { result = llm.build_minicpm3(); From 253b7fde910731104670724391bfbcb94d97d0c3 Mon Sep 17 00:00:00 2001 From: ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:45:48 +0000 Subject: [PATCH 268/279] Fix HF repo commit to clone lora test models (#10649) --- tests/test-lora-conversion-inference.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh index fe90ce0d1..fb308a9ff 100755 --- a/tests/test-lora-conversion-inference.sh +++ b/tests/test-lora-conversion-inference.sh @@ -10,11 +10,16 @@ declare -a params=( MODELS_REPO=lora-tests MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO +COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890 # Clone the Hugging Face repository if the directory does not exist if [ ! -d "$MODELS_REPO" ]; then echo "Cloning the Hugging Face repository..." git clone $MODELS_REPO_URL --depth 1 + cd $MODELS_REPO + git fetch --depth=1 origin $COMMIT + git reset --hard $COMMIT + cd - else echo "Repository already exists. Skipping clone." fi From 2803540814bf0a4e44d0960ff6afda6bac971c17 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:40:44 +0100 Subject: [PATCH 269/279] ggml-cpu : fix HWCAP2_I8MM value (#10646) --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 23ae2e10c..e4a9ca013 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2425,7 +2425,7 @@ bool ggml_is_numa(void) { #endif #if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM 0 +#define HWCAP2_I8MM (1 << 13) #endif static void ggml_init_arm_arch_features(void) { From 59f4db10883a4f3e855cffbf2c3ab68430e95272 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:45:40 +0100 Subject: [PATCH 270/279] ggml : add predefined list of CPU backend variants to build (#10626) * ggml : add predefined list of CPU backend variants to build * update CPU dockerfiles --- .devops/full.Dockerfile | 33 +- .devops/llama-cli.Dockerfile | 18 +- .devops/llama-server.Dockerfile | 22 +- ggml/CMakeLists.txt | 49 ++- ggml/src/CMakeLists.txt | 35 ++ ggml/src/ggml-backend-reg.cpp | 32 +- ggml/src/ggml-cpu/CMakeLists.txt | 595 +++++++++++++++------------- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 85 ++-- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- ggml/src/ggml-cpu/ggml-cpu.cpp | 10 +- scripts/build-cpu.sh | 12 - 11 files changed, 502 insertions(+), 391 deletions(-) delete mode 100755 scripts/build-cpu.sh diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 2a06f82b7..d93c0be6a 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -ENV LLAMA_CURL=1 +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; +FROM ubuntu:$UBUNTU_VERSION as runtime -RUN make -j$(nproc) +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt /app/requirements.txt +COPY requirements /app/requirements +COPY .devops/tools.sh /app/tools.sh + +RUN pip install --upgrade pip setuptools wheel && \ + pip install -r /app/requirements.txt + +COPY --from=build /app/build/bin/ /app/ +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/convert_hf_to_gguf.py /app/ +COPY --from=build /app/gguf-py /app/gguf-py ENV LC_ALL=C.utf8 -ENTRYPOINT ["/app/.devops/tools.sh"] +ENTRYPOINT ["/app/tools.sh"] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 7f741aa46..be234d55d 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -RUN make -j$(nproc) llama-cli +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime -RUN apt-get update && \ - apt-get install -y libgomp1 +WORKDIR /app -COPY --from=build /app/llama-cli /llama-cli +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 curl + +COPY --from=build /app/build/bin/llama-cli /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 7110dda9e..72ccde2fe 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -9,28 +9,20 @@ WORKDIR /app COPY . . - -RUN \ - # Build multiple versions of the CPU backend - scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ - scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ - scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ - scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ - # Build llama-server - cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build --target llama-server -j $(nproc) && \ - # Copy the built libraries to /app/lib +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ mkdir -p /app/lib && \ - mv libggml-cpu* /app/lib/ && \ find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime +WORKDIR /app + RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/build/bin/llama-server /llama-server -COPY --from=build /app/lib/ / +COPY --from=build /app/build/bin/llama-server /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine @@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/app/llama-server" ] diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 06d371e09..1b3d98967 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -92,30 +92,33 @@ else() set(INS_ENB ON) endif() -option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) - -option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) -option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) -option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) -option(GGML_AVX512 "ggml: enable AVX512" OFF) -option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) -option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) -option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) -option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) -option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) -option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) -option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) +option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) +option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) +option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) +option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) +option(GGML_AVX512 "ggml: enable AVX512F" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) if (NOT MSVC) - option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 + # in MSVC F16C and FMA is implied with AVX2/AVX512 + option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) + option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) + # MSVC does not seem to support AMX + option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) + option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) + option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) endif() -option(GGML_LASX "ggml: enable lasx" ON) -option(GGML_LSX "ggml: enable lsx" ON) -option(GGML_RVV "ggml: enable rvv" ON) -option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_LASX "ggml: enable lasx" ON) +option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_RVV "ggml: enable rvv" ON) +option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) + if (WIN32) - set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") + set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() # ggml core @@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) -if (GGML_SYCL) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 11) -endif() +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 19289f32b..f07533fdb 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -269,7 +269,42 @@ function(ggml_add_backend backend) endif() endfunction() +function(ggml_add_cpu_backend_variant tag_name) + set(GGML_CPU_TAG_NAME ${tag_name}) + # other: OPENMP LLAMAFILE CPU_HBM + foreach (feat NATIVE + AVX AVX2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + + ggml_add_cpu_backend_variant_impl(${tag_name}) +endfunction() + ggml_add_backend(CPU) + +if (GGML_CPU_ALL_VARIANTS) + if (NOT GGML_BACKEND_DL) + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + endif() + ggml_add_cpu_backend_variant(sandybridge AVX) + ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA) + ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + if (NOT MSVC) + # MSVC doesn't support AVX-VNNI or AMX + ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + endif() +else () + ggml_add_cpu_backend_variant_impl("") +endif() + ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 2c4bf11b0..5cb0fb9d1 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) best_score = s; best_path = entry.path().string(); } + } else { + if (!silent) { + GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str()); + } } } } @@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { - ggml_backend_load_best("blas", true); - ggml_backend_load_best("cann", true); - ggml_backend_load_best("cuda", true); - ggml_backend_load_best("hip", true); - ggml_backend_load_best("kompute", true); - ggml_backend_load_best("metal", true); - ggml_backend_load_best("rpc", true); - ggml_backend_load_best("sycl", true); - ggml_backend_load_best("vulkan", true); - ggml_backend_load_best("musa", true); - ggml_backend_load_best("cpu", true); +#ifdef NDEBUG + bool silent = true; +#else + bool silent = false; +#endif + + ggml_backend_load_best("blas", silent); + ggml_backend_load_best("cann", silent); + ggml_backend_load_best("cuda", silent); + ggml_backend_load_best("hip", silent); + ggml_backend_load_best("kompute", silent); + ggml_backend_load_best("metal", silent); + ggml_backend_load_best("rpc", silent); + ggml_backend_load_best("sycl", silent); + ggml_backend_load_best("vulkan", silent); + ggml_backend_load_best("musa", silent); + ggml_backend_load_best("cpu", silent); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 5df63884c..bc326c059 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,319 +1,354 @@ -ggml_add_backend_library(ggml-cpu) - -list (APPEND GGML_CPU_SOURCES - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - amx/amx.cpp - amx/amx.h - amx/mmq.cpp - amx/mmq.h - ggml-cpu-impl.h - ) - -target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) -target_include_directories(ggml-cpu PRIVATE .) - -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) - - target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) +function(ggml_add_cpu_backend_variant_impl tag_name) + if (tag_name) + set(GGML_CPU_NAME ggml-cpu-${tag_name}) else() - message(WARNING "Accelerate framework not found") + set(GGML_CPU_NAME ggml-cpu) endif() -endif() -if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - message(STATUS "OpenMP found") + ggml_add_backend_library(${GGML_CPU_NAME}) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) + list (APPEND GGML_CPU_SOURCES + ggml-cpu/ggml-cpu.c + ggml-cpu/ggml-cpu.cpp + ggml-cpu/ggml-cpu-aarch64.c + ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-quants.c + ggml-cpu/ggml-cpu-quants.h + ggml-cpu/amx/amx.cpp + ggml-cpu/amx/amx.h + ggml-cpu/amx/mmq.cpp + ggml-cpu/amx/mmq.h + ggml-cpu/ggml-cpu-impl.h + ) - target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu) + + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64) + + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() endif() -endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() + endif() - list(APPEND GGML_CPU_SOURCES - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() + if (GGML_LLAMAFILE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE) -if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) + list(APPEND GGML_CPU_SOURCES + ggml-cpu/llamafile/sgemm.cpp + ggml-cpu/llamafile/sgemm.h) + endif() - message(STATUS "Using memkind for CPU HBM") + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) + message(STATUS "Using memkind for CPU HBM") - target_link_libraries(ggml-cpu PUBLIC memkind) -endif() + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) -if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR - CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND - NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) + endif() - message(STATUS "ARM detected") + if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - if (MSVC) - list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead - list(APPEND ARCH_DEFINITIONS __ARM_NEON) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) + message(STATUS "ARM detected") - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + if (MSVC) + list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - message(STATUS "ARM feature DOTPROD enabled") - endif () + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - - message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - elseif (APPLE) - if (GGML_NATIVE) - set(USER_PROVIDED_MARCH FALSE) - foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) - if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") - set(USER_PROVIDED_MARCH TRUE) - break() - endif() - endforeach() - - if (NOT USER_PROVIDED_MARCH) - set(MARCH_FLAGS "-march=armv8.2a") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - - message(STATUS "ARM feature DOTPROD enabled") - endif () - - set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") - - set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - - list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + message(STATUS "ARM feature DOTPROD enabled") endif () - endif () - else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + + message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + elseif (APPLE) + if (GGML_NATIVE) + set(USER_PROVIDED_MARCH FALSE) + foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) + if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") + set(USER_PROVIDED_MARCH TRUE) + break() + endif() + endforeach() + + if (NOT USER_PROVIDED_MARCH) + set(MARCH_FLAGS "-march=armv8.2a") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") + endif () + + set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") + + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + endif () + endif () + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) endif() endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (GGML_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (GGML_AVX512_VBMI) - list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + include(ggml-cpu/cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__ + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + if (GGML_AVX512_VBMI) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + else () + list(APPEND ARCH_FLAGS /arch:SSE4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() + if (GGML_AVX_VNNI) + # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2 + #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI) + endif() + else () + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + else () + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + list(APPEND ARCH_DEFINITIONS GGML_F16C) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + list(APPEND ARCH_DEFINITIONS GGML_FMA) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512cd) + list(APPEND ARCH_FLAGS -mavx512vl) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + endif() + if (GGML_AVX512_VBMI) list(APPEND ARCH_FLAGS -mavx512vbmi) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI) endif() - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + if (GGML_AVX512_VNNI) list(APPEND ARCH_FLAGS -mavx512vnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI) endif() - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_DEFINITIONS __AVX512BF16__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + if (GGML_AVX512_BF16) list(APPEND ARCH_FLAGS -mavx512bf16) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16) endif() endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_DEFINITIONS __AMX_TILE__) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_DEFINITIONS __AMX_INT8__) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_DEFINITIONS __AMX_BF16__) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_DEFINITIONS __AVXVNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavxvnni) - endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) + endif() + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) endif() else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_FLAGS -mavxvnni) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) - endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) - string(FIND "${POWER10_M}" "POWER10" substring_index) - if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") - set(substring_index -1) + message(STATUS "Unknown architecture") endif() - if (${substring_index} GREATER_EQUAL 0) - list(APPEND ARCH_FLAGS -mcpu=power10) - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + if (GGML_CPU_AARCH64) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - list(APPEND ARCH_FLAGS -march=loongarch64) - if (GGML_LASX) - list(APPEND ARCH_FLAGS -mlasx) + message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}") + target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES}) + target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS}) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) + + if (GGML_BACKEND_DL) + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) endif() - if (GGML_LSX) - list(APPEND ARCH_FLAGS -mlsx) + + if (EMSCRIPTEN) + set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") - message(STATUS "RISC-V detected") - if (GGML_RVV) - list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) - endif() -else() - message(STATUS "Unknown architecture") -endif() - -if (GGML_CPU_AARCH64) - message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) -endif() - -target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") - -# the feature detection code must be compiled without any architecture flags -target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) -# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection - -if (EMSCRIPTEN) - set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") -endif() +endfunction() diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 514701ffe..e8133d411 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -1,4 +1,3 @@ -#include "ggml-cpu.h" #include "ggml-backend-impl.h" #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) @@ -13,6 +12,7 @@ #include #include +// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf struct cpuid_x86 { bool SSE3(void) { return f_1_ecx[0]; } bool PCLMULQDQ(void) { return f_1_ecx[1]; } @@ -50,11 +50,15 @@ struct cpuid_x86 { bool INVPCID(void) { return f_7_ebx[10]; } bool RTM(void) { return is_intel && f_7_ebx[11]; } bool AVX512F(void) { return f_7_ebx[16]; } + bool AVX512DQ(void) { return f_7_ebx[17]; } bool RDSEED(void) { return f_7_ebx[18]; } bool ADX(void) { return f_7_ebx[19]; } bool AVX512PF(void) { return f_7_ebx[26]; } bool AVX512ER(void) { return f_7_ebx[27]; } bool AVX512CD(void) { return f_7_ebx[28]; } + bool AVX512BW(void) { return f_7_ebx[30]; } + bool AVX512VL(void) { return f_7_ebx[31]; } + bool SHA(void) { return f_7_ebx[29]; } bool PREFETCHWT1(void) { return f_7_ecx[0]; } @@ -259,36 +263,57 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - cpuid_x86 is; - // if the CPU backend was built with any features not supported by the current CPU, it cannot be used - if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } - if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } - if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } - if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } - if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } - if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } - if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } - if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } - if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } - if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } - if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } - if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } - - // calculate a backend score based on the supported features - // more important features have a higher weight int score = 0; - score += ggml_cpu_has_fma () * 1; - score += ggml_cpu_has_f16c () * 1<<1; - score += ggml_cpu_has_ssse3 () * 1<<2; - score += ggml_cpu_has_sse3 () * 1<<3; - score += ggml_cpu_has_avx_vnni () * 1<<4; - score += ggml_cpu_has_avx () * 1<<5; - score += ggml_cpu_has_avx2 () * 1<<6; - score += ggml_cpu_has_avx512 () * 1<<7; - // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used - score += ggml_cpu_has_avx512_bf16() * 1<<9; - score += ggml_cpu_has_avx512_vnni() * 1<<10; - score += ggml_cpu_has_amx_int8 () * 1<<11; + cpuid_x86 is; + +#ifdef GGML_FMA + if (!is.FMA()) { return 0; } + score += 1; +#endif +#ifdef GGML_F16C + if (!is.F16C()) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_SSE42 + if (!is.SSE42()) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_AVX + if (!is.AVX()) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_AVX2 + if (!is.AVX2()) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_AVX_VNNI + if (!is.AVX_VNNI()) { return 0; } + score += 1<<6; +#endif +#ifdef GGML_AVX512 + if (!is.AVX512F()) { return 0; } + if (!is.AVX512CD()) { return 0; } + if (!is.AVX512VL()) { return 0; } + if (!is.AVX512DQ()) { return 0; } + if (!is.AVX512BW()) { return 0; } + score += 1<<7; +#endif +#ifdef GGML_AVX512_VBMI + if (!is.AVX512_VBMI()) { return 0; } + score += 1<<8; +#endif +#ifdef GGML_AVX512_BF16 + if (!is.AVX512_BF16()) { return 0; } + score += 1<<9; +#endif +#ifdef GGML_AVX512_VNNI + if (!is.AVX512_VNNI()) { return 0; } + score += 1<<10; +#endif +#ifdef GGML_AMX_INT8 + if (!is.AMX_INT8()) { return 0; } + score += 1<<11; +#endif return score; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e4a9ca013..40ca7bb68 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -756,7 +756,7 @@ do { \ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { +static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 77e5d87a8..d3b4bdb96 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } - // TODO: rename this + #ifdef GGML_USE_ACCELERATE + features.push_back({ "ACCELERATE", "1" }); + #endif + #ifdef GGML_USE_CPU_HBM + features.push_back({ "CPU_HBM", "1" }); + #endif + #ifdef GGML_USE_OPENMP + features.push_back({ "OPENMP", "1" }); + #endif #ifdef GGML_USE_CPU_AARCH64 features.push_back({ "AARCH64_REPACK", "1" }); #endif diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh deleted file mode 100755 index 4b2ad816e..000000000 --- a/scripts/build-cpu.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -name="$1" -args="${@:2}" - -echo "Building $name with args: $args" - -rm -fr build-cpu-$1 -cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args -cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) -cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so -rm -fr build-cpu-$1 From 1da7b765692764a8b33b08da61cbee63812a7bd9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Dec 2024 22:38:20 +0200 Subject: [PATCH 271/279] server : fix speculative decoding with context shift (#10641) * server : fix speculative decoding with context shift ggml-ci * server : take into account speculative limits ggml-ci * server : add tests --- examples/server/server.cpp | 29 +++++++++++++++-- .../server/tests/unit/test_speculative.py | 31 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9bca3f30e..31dfd6240 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -921,6 +921,8 @@ struct server_context { slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); + slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2); + slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); if (slot.params.sampling.dry_base < 1.0f) { slot.params.sampling.dry_base = defaults.sampling.dry_base; @@ -2322,10 +2324,29 @@ struct server_context { continue; } + // determine the max draft that fits the current slot state + int n_draft_max = slot.params.speculative.n_max; + + // note: n_past is not yet increased for the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2); + + if (slot.n_remaining > 0) { + n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); + } + + SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < slot.params.speculative.n_min) { + SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min); + + continue; + } + llama_token id = slot.sampled; struct common_speculative_params params_spec; - params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_draft = n_draft_max; params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.p_min = slot.params.speculative.p_min; @@ -2333,6 +2354,8 @@ struct server_context { // ignore small drafts if (slot.params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min); + continue; } @@ -2344,6 +2367,8 @@ struct server_context { common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); } + SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); + llama_decode(ctx, slot.batch_spec); // the accepted tokens from the speculation @@ -2372,7 +2397,7 @@ struct server_context { } } - SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); } } diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py index 982d6abb4..3bb5733cb 100644 --- a/examples/server/tests/unit/test_speculative.py +++ b/examples/server/tests/unit/test_speculative.py @@ -82,6 +82,37 @@ def test_different_draft_min_draft_max(): last_content = res.body["content"] +def test_slot_ctx_not_exceeded(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + + +def test_with_ctx_shift(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "n_predict": 64, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + assert res.body["tokens_predicted"] == 64 + assert res.body["truncated"] == True + + @pytest.mark.parametrize("n_slots,n_requests", [ (1, 2), (2, 2), From f112d198cd9953b633ac662c2705d6e423438717 Mon Sep 17 00:00:00 2001 From: aryantandon01 <80969509+aryantandon01@users.noreply.github.com> Date: Thu, 5 Dec 2024 03:49:20 +0530 Subject: [PATCH 272/279] Update deprecation-warning.cpp (#10619) Fixed Path Separator Handling for Cross-Platform Support (Windows File Systems) --- examples/deprecation-warning/deprecation-warning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index 11b35d2c2..c2958ea12 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) { } // Get only the program name from the full path - auto pos = filename.find_last_of('/'); + auto pos = filename.find_last_of("/\\"); if (pos != std::string::npos) { filename = filename.substr(pos+1); } From d405804be84cfdac936f28b3446ce8cb988408d8 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 5 Dec 2024 08:47:55 +0100 Subject: [PATCH 273/279] py : update outdated copy-paste instructions [no ci] (#10667) This commit updates the copy-paste instruction in convert_hf_to_gguf_update.py to reflect that convert_hf_to_gguf.py will have already been updated with the new get_vocab_base_pre() function when this script completes. --- convert_hf_to_gguf_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 28cd02e5a..2a51fce2d 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -17,7 +17,7 @@ # # python3 convert_hf_to_gguf_update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py +# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp From c2082d93a82d66dca112098c63775d7a97e6d47b Mon Sep 17 00:00:00 2001 From: PAB Date: Tue, 3 Dec 2024 20:20:04 +0100 Subject: [PATCH 274/279] ggml : add `GGML_PAD_REFLECT_1D` operation (ggml/1034) * ggml_pad_reflect_1d defined in header * implemented on CPU * called the forward pass * impl Metal kernel * added Metal kernel * added OP_PAD_REFLECT_1D in test-backend-ops.cpp * add test-pad-reflect-1d test case * test case support multiple backend --- ggml/include/ggml.h | 8 +++++ ggml/src/ggml-cpu/ggml-cpu.c | 39 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.m | 35 +++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 47 ++++++++++++++++++++++++++++ ggml/src/ggml.c | 37 ++++++++++++++++++++-- tests/test-backend-ops.cpp | 28 +++++++++++++++++ 6 files changed, 192 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 65cb92c44..1c8cc11b6 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -499,6 +499,7 @@ extern "C" { GGML_OP_POOL_2D_BACK, GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, + GGML_OP_PAD_REFLECT_1D, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1695,6 +1696,13 @@ extern "C" { int p2, int p3); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] + GGML_API struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1); + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 40ca7bb68..f82e877f0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10439,6 +10439,40 @@ static void ggml_compute_forward_pad( } } +// ggml_compute_forward_pad_reflect_1d + +static void ggml_compute_forward_pad_reflect_1d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t * opts = (const int32_t *) dst->op_params; + const int p0 = opts[0]; + const int p1 = opts[1]; + + GGML_TENSOR_UNARY_OP_LOCALS + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); + float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); + + ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01)); + + for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; } + for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } + } + } + } +} // ggml_compute_forward_arange @@ -12535,6 +12569,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad(params, tensor); } break; + case GGML_OP_PAD_REFLECT_1D: + { + ggml_compute_forward_pad_reflect_1d(params, tensor); + } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -12877,6 +12915,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 093ae9000..71f5525f3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -310,6 +310,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, + GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -877,6 +878,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3258,6 +3261,38 @@ static void ggml_metal_encode_node( const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD_REFLECT_1D: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int32_t p0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[1]; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14]; + [encoder setBytes:&p0 length:sizeof(p0) atIndex:15]; + [encoder setBytes:&p1 length:sizeof(p1) atIndex:16]; + + const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 5caa0846a..ee3e353ab 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32( } } +kernel void kernel_pad_reflect_1d_f32( + device const char * src0, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & ne0, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int32_t & p0, + constant int32_t & p1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3; + const int64_t i02 = i2; + const int64_t i01 = i1; + + device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); + device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); + + if (i1 < ne01 && i2 < ne02 && i3 < ne03) { + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + if (i0 < p0) { + dst_ptr[i0] = src0_ptr[p0 - i0]; + } else if (i0 < ne0 - p1) { + dst_ptr[i0] = src0_ptr[i0 - p0]; + } else { + dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1]; + } + } + } +} + kernel void kernel_arange_f32( device char * dst, constant int64_t & ne0, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1a9a7efaf..2c338dee5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "POOL_2D_BACK", "UPSCALE", "PAD", + "PAD_REFLECT_1D", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "pool_2d_back(x)", "upscale(x)", "pad(x)", + "pad_reflect_1d(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad( return result; } +// ggml_pad_reflect_1d + +struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1) { + GGML_ASSERT(p0 >= 0); + GGML_ASSERT(p1 >= 0); + + GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the + GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded + + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] + p0 + p1, + a->ne[1], + a->ne[2], + a->ne[3]); + + int32_t params[] = { p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_PAD_REFLECT_1D; + result->src[0] = a; + + return result; +} + // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 807d271c6..042eeaaab 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2697,6 +2697,33 @@ struct test_pad : public test_case { } }; +// GGML_OP_PAD_REFLECT_1D +struct test_pad_reflect_1d : public test_case { + const ggml_type type; + const std::array ne_a; + const int pad_0; + const int pad_1; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + } + + test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 34, 2, 1}, + int pad_0 = 10, int pad_1 = 9) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_ARANGE struct test_arange : public test_case { const ggml_type type; @@ -3816,6 +3843,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_arange()); test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); From a8cbab201dcf4c76c30b8028427494d71f02bb57 Mon Sep 17 00:00:00 2001 From: PAB Date: Wed, 4 Dec 2024 09:19:30 +0100 Subject: [PATCH 275/279] ggml: add `GGML_SET` Metal kernel + i32 CPU kernel (ggml/1037) * implemented cpu kernel * add i32 test cases in test-backend-ops * typedef `ggml_metal_kargs_set` * implemented `kernel_set` * memcpy --- ggml/src/ggml-cpu/ggml-cpu.c | 80 ++++++++++++++++++++++++++- ggml/src/ggml-metal/ggml-metal-impl.h | 15 +++++ ggml/src/ggml-metal/ggml-metal.m | 76 +++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 32 +++++++++++ tests/test-backend-ops.cpp | 4 ++ 5 files changed, 206 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f82e877f0..f12b62e3b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1374,7 +1374,10 @@ struct ggml_compute_state { inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } + inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } @@ -8248,6 +8251,77 @@ static void ggml_compute_forward_set_f32( } } +static void ggml_compute_forward_set_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + if (params->ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(int32_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_i32(nc, + (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + static void ggml_compute_forward_set( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8259,6 +8333,10 @@ static void ggml_compute_forward_set( { ggml_compute_forward_set_f32(params, dst); } break; + case GGML_TYPE_I32: + { + ggml_compute_forward_set_i32(params, dst); + } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index e19586719..e3dc25f16 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -102,6 +102,21 @@ typedef struct { uint64_t nb3; } ggml_metal_kargs_cpy; +typedef struct { + int64_t ne10; + int64_t ne11; + int64_t ne12; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + uint64_t offs; + bool inplace; +} ggml_metal_kargs_set; + typedef struct { int32_t ne00; int32_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 71f5525f3..f80fda7a4 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -372,6 +372,8 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, + GGML_METAL_KERNEL_TYPE_SET_I32, + GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -940,6 +942,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -1159,6 +1163,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } + case GGML_OP_SET: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + return true; + default: + return false; + }; + } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -3824,6 +3838,68 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_SET: + { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // src0 and dst as viewed during set + const size_t dst_nb0 = ggml_element_size(src0); + + const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; + const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; + const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; + const size_t offset = ((int32_t *) dst->op_params)[3]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst)); + } + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + GGML_ASSERT(nb10 == sizeof(float)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; + case GGML_TYPE_I32: + GGML_ASSERT(nb10 == sizeof(int32_t)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + ggml_metal_kargs_set args = { + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb1 =*/ dst_nb1, + /*.nb2 =*/ dst_nb2, + /*.nb3 =*/ dst_nb3, + /*.offs =*/ offset, + /*.inplace =*/ inplace, + }; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ee3e353ab..8ba43904d 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3927,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ #undef FA_TYPES +template +kernel void kernel_set( + constant ggml_metal_kargs_set & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i13 = tgpig[2]; + const int i12 = tgpig[1]; + const int i11 = tgpig[0]; + + const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; + + const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); + const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); + const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; + + device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); + + for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { + device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); + dst_data[i10] = (T) src[0]; + } +} + +typedef decltype(kernel_set) kernel_set_t; + +template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; +template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; + template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 042eeaaab..9dd41260a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3521,6 +3521,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); } + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); + } + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); From 0cd182ebcc2c45907240e727e80e8fbf0f5fdee9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Dec 2024 13:27:42 +0200 Subject: [PATCH 276/279] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 27769c93b..47eae44f7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -b903ffe79daf18c0aaacbebe44a7b93a6b8d0982 +74d66b63eaf207a24f3e93bb922aba131cbf2906 From 6fe624783166e7355cec915de0094e63cd3558eb Mon Sep 17 00:00:00 2001 From: Riccardo Orlando Date: Thu, 5 Dec 2024 19:30:59 +0100 Subject: [PATCH 277/279] llama : add Minerva 7B model support (#10673) * Support for Minerva 7B * Update convert_hf_to_gguf_update.py --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + include/llama.h | 1 + src/llama-vocab.cpp | 1 + src/llama.cpp | 3 +++ 5 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d8df5cc00..9f1419e29 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -658,6 +658,9 @@ class Model: if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2a51fce2d..ce3c571df 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -102,6 +102,7 @@ models = [ {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, ] diff --git a/include/llama.h b/include/llama.h index 168c3fa1f..d121354c1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -104,6 +104,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d1dc96276..8c9aaf5a0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_EXAONE: + case LLAMA_VOCAB_PRE_TYPE_MINERVA: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/src/llama.cpp b/src/llama.cpp index 00f78639e..ba4a9dfcf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6479,6 +6479,9 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.tokenizer_add_bos = true; vocab.tokenizer_clean_spaces = false; + } else if ( + tokenizer_pre == "minerva-7b") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From c9c6e01daedac542b174c235872569fce5385982 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Dec 2024 13:15:05 -0600 Subject: [PATCH 278/279] vulkan: Add VK_NV_cooperative_matrix2 support for mul_mat and flash attention (#10206) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 741 ++++++++++++++++-- .../ggml-vulkan/vulkan-shaders/CMakeLists.txt | 2 + .../vulkan-shaders/dequant_funcs_cm2.comp | 305 +++++++ .../vulkan-shaders/flash_attn_cm2.comp | 289 +++++++ .../vulkan-shaders/mul_mm_cm2.comp | 328 ++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 97 ++- 6 files changed, 1665 insertions(+), 97 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 07b45d6b9..c7ac0e8f7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -167,6 +167,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool coopmat2; size_t idx; @@ -176,6 +177,7 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_matmul_f16_f32; vk_pipeline pipeline_matmul_split_k_reduce; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32; @@ -229,6 +231,14 @@ struct vk_device_struct { vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_pool2d_f32; + // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} + vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + std::unordered_map pipelines; std::unordered_map pipeline_descriptor_set_requirements; @@ -340,6 +350,40 @@ struct vk_mat_vec_id_push_constants { uint32_t nei0; uint32_t ne11; }; +struct vk_flash_attn_push_constants { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +}; + struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -1265,6 +1309,23 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events ); } +// number of rows/cols for flash attention shader +static constexpr uint32_t flash_attention_num_small_rows = 32; +static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) { + GGML_UNUSED(clamp); + + // small rows, large cols + if (small_rows) { + return {flash_attention_num_small_rows, 128}; + } + // small cols to reduce register count + if (ggml_is_quantized(type) || D == 256) { + return {64, 32}; + } + return {64, 64}; +}; + + static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1275,59 +1336,98 @@ static void ggml_vk_load_shaders(vk_device& device) { // mulmat std::vector l_warptile, m_warptile, s_warptile, - l_warptile_mmq, m_warptile_mmq, s_warptile_mmq; + l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, + l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, + l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid; std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, - l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms; + l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, + l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, + l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; + uint32_t l_align, m_align, s_align; + if (device->coopmat2) { + // spec constants and tile sizes for non-quant matmul/matmul_id + l_warptile = { 256, 128, 256, 64 }; + m_warptile = { 256, 128, 128, 64 }; + s_warptile = { 128, 32, 16, 64 }; + l_wg_denoms = {128, 256, 1 }; + m_wg_denoms = {128, 128, 1 }; + s_wg_denoms = { 32, 16, 1 }; - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + // spec constants and tile sizes for quant matmul (non-Qi_K) + l_warptile_mmq = { 256, 128, 256, 64 }; + m_warptile_mmq = { 256, 128, 128, 64 }; + s_warptile_mmq = { 256, 128, 128, 64 }; + l_mmq_wg_denoms = { 128, 256, 1 }; + m_mmq_wg_denoms = { 128, 128, 1 }; + s_mmq_wg_denoms = { 128, 128, 1 }; - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + // spec constants and tile sizes for quant matmul (Qi_K) + l_warptile_mmq_k = { 256, 128, 512, 16 }; + m_warptile_mmq_k = { 256, 128, 256, 16 }; + s_warptile_mmq_k = { 256, 32, 128, 64 }; + l_mmq_wg_denoms_k = { 128, 512, 1 }; + m_mmq_wg_denoms_k = { 128, 256, 1 }; + s_mmq_wg_denoms_k = { 32, 128, 1 }; - l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; - m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; - s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + // spec constants and tile sizes for quant matmul_id + l_warptile_mmqid = { 256, 128, 128, 16 }; + m_warptile_mmqid = { 256, 128, 64, 16 }; + s_warptile_mmqid = { 256, 64, 64, 16 }; + l_mmqid_wg_denoms = { 128, 128, 1 }; + m_mmqid_wg_denoms = { 128, 64, 1 }; + s_mmqid_wg_denoms = { 64, 64, 1 }; - l_align = 128; - m_align = 64; - s_align = 32; + l_align = 128; + m_align = 64; + s_align = 32; + } else { + l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; + m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; + s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + l_align = 128; + m_align = 64; + s_align = 32; - // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders - // and tile sizes, this should handle 16KB, 32KB, and 48KB+. - // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. - // But the numbers happen to work out for 32KB shared memory size that when using the medium - // size there's enough room for everything, and we assert for this. - uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - l_warptile = m_warptile; - l_wg_denoms = m_wg_denoms; - shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); - } - - shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - if (device->properties.limits.maxComputeSharedMemorySize == 32768) { - l_warptile_mmq = m_warptile_mmq; - l_mmq_wg_denoms = m_mmq_wg_denoms; - } else { - l_warptile_mmq = s_warptile_mmq; - l_mmq_wg_denoms = s_mmq_wg_denoms; + // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders + // and tile sizes, this should handle 16KB, 32KB, and 48KB+. + // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. + // But the numbers happen to work out for 32KB shared memory size that when using the medium + // size there's enough room for everything, and we assert for this. + uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + l_warptile = m_warptile; + l_wg_denoms = m_wg_denoms; + shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. + GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + if (device->properties.limits.maxComputeSharedMemorySize == 32768) { + l_warptile_mmq = m_warptile_mmq; + l_mmq_wg_denoms = m_mmq_wg_denoms; + } else { + l_warptile_mmq = s_warptile_mmq; + l_mmq_wg_denoms = s_mmq_wg_denoms; + } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); + } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. + GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + } } device->pipeline_matmul_f32 = std::make_shared(); @@ -1362,6 +1462,105 @@ static void ggml_vk_load_shaders(vk_device& device) { compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; +#if defined(VK_NV_cooperative_matrix2) + if (device->coopmat2) { + + auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { + return {fa_rows_cols(D, clamp, type, small_rows)[0], 1, 1}; + }; + + auto const &fa_spec_constants = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector { + // For large number of rows, 128 invocations seems to work best. + // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we + // can't use 256 for D==80. + uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128; + auto rows_cols = fa_rows_cols(D, clamp, type, small_rows); + return {wg_size, rows_cols[0], rows_cols[1], (D), clamp}; + }; + +#define CREATE_FA2(TYPE, NAMELC, D) \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + +#define CREATE_FA(TYPE, NAMELC) \ + CREATE_FA2(TYPE, NAMELC, 64) \ + CREATE_FA2(TYPE, NAMELC, 80) \ + CREATE_FA2(TYPE, NAMELC, 96) \ + CREATE_FA2(TYPE, NAMELC, 112) \ + CREATE_FA2(TYPE, NAMELC, 128) \ + CREATE_FA2(TYPE, NAMELC, 256) + + CREATE_FA(GGML_TYPE_F16, f16) + CREATE_FA(GGML_TYPE_Q4_0, q4_0) + CREATE_FA(GGML_TYPE_Q4_1, q4_1) + CREATE_FA(GGML_TYPE_Q5_0, q5_0) + CREATE_FA(GGML_TYPE_Q5_1, q5_1) + CREATE_FA(GGML_TYPE_Q8_0, q8_0) + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //CREATE_FA(GGML_TYPE_Q2_K, q2_k) + //CREATE_FA(GGML_TYPE_Q3_K, q3_k) + //CREATE_FA(GGML_TYPE_Q4_K, q4_k) + //CREATE_FA(GGML_TYPE_Q5_K, q5_k) + //CREATE_FA(GGML_TYPE_Q6_K, q6_k) + CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl) +#undef CREATE_FA + + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) +#undef CREATE_MM +#undef CREATE_MM2 + } else +#endif if (device->fp16) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ @@ -1648,15 +1847,28 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + bool fp16_storage = false; + bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; + bool pipeline_robustness = false; + bool coopmat2_support = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; + } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VULKAN_DISABLE_COOPMAT2")) { + coopmat2_support = true; } } @@ -1679,6 +1891,14 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct = (VkBaseOutStructure *)&sm_props; } +#if defined(VK_NV_cooperative_matrix2) + vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; + last_struct = (VkBaseOutStructure *)&coopmat2_props; + } +#endif + device->physical_device.getProperties2(&props2); device->properties = props2.properties; @@ -1701,20 +1921,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->shader_core_count = 0; } - bool fp16_storage = false; - bool fp16_compute = false; - bool pipeline_robustness = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; - } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { - pipeline_robustness = true; - } - } - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; @@ -1757,22 +1963,112 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + last_struct = (VkBaseOutStructure *)&vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; pl_robustness_features.pNext = nullptr; pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; pl_robustness_features.pipelineRobustness = VK_FALSE; if (pipeline_robustness) { - vk12_features.pNext = &pl_robustness_features; + last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; + last_struct = (VkBaseOutStructure *)&pl_robustness_features; device_extensions.push_back("VK_EXT_pipeline_robustness"); } +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.pNext = nullptr; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + device_extensions.push_back("VK_NV_cooperative_matrix2"); + } +#endif + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (coopmat2_support) { +#if defined(VK_NV_cooperative_matrix2) + if (coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads && + vk12_features.bufferDeviceAddress) { + + std::vector flexible_dimensions; + uint32_t count = 0; + + PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = + (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) + vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); + + VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; + empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; + flexible_dimensions.resize(count, empty_prop); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); + + bool found_fp16_128 = false, + found_fp16_256 = false, + found_fp32_128 = false, + found_fp32_256 = false; + // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 + // with 32x16x16 and 256 with 32x32x16. + for (auto &prop : flexible_dimensions) { + if (prop.saturatingAccumulation == VK_FALSE && + prop.scope == VK_SCOPE_WORKGROUP_KHR && + prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_128 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_128 = true; + } + } + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_256 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_256 = true; + } + } + } + } + if (found_fp16_128 && found_fp16_256 && + found_fp32_128 && found_fp32_256 && + coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { + device->coopmat2 = true; + } + } +#endif + } + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -2124,7 +2420,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type return ctx->device->pipeline_dequant[type]; } -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; @@ -2132,14 +2428,23 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f16_f32.f32acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f16.f32acc; + if (prec == GGML_PREC_DEFAULT && ctx->device->coopmat2) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f32acc; + } } - if (src1_type != GGML_TYPE_F32) { + if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { return nullptr; } @@ -2160,6 +2465,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return nullptr; } + if (ctx->device->coopmat2) { + assert(src1_type == GGML_TYPE_F16); + return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc; + } return ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } @@ -2844,6 +3153,16 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, break; } + if (ctx->device->coopmat2) { + if ((m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) { + return aligned ? mmp->a_l : mmp->l; + } + if ((m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + if (m <= 32 || n <= 32) { return aligned ? mmp->a_s : mmp->s; } @@ -3008,18 +3327,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + // Reformat and convert to fp16 if src1 is non-contiguous, or for coopmat2 for better perf + const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + !ggml_vk_dim01_contiguous(src1); const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16); + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]); } // Not implemented @@ -3930,6 +4251,167 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx } } +static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) { + VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; + std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; + std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const uint32_t nem1 = mask ? mask->ne[1] : 0; + const uint32_t nbm1 = mask ? mask->nb[1] : 0; + + const uint32_t D = neq0; + const uint32_t N = neq1; + const uint32_t KV = nek1; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(nev1 == nek1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + assert(dst->type == GGML_TYPE_F32); + assert(q->type == GGML_TYPE_F32); + assert(k->type == v->type); + + vk_pipeline *pipelines; + // XXX TODO other backends may be changing accumulator precision to default to f32 soon + bool f32acc = dst->op_params[3] == GGML_PREC_F32; + bool small_rows = N <= flash_attention_num_small_rows; + switch (D) { + case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break; + case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break; + case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break; + case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break; + case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break; + case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break; + default: + assert(!"unsupported D value"); + return; + } + assert(pipelines); + + bool aligned = (KV % pipelines[1]->align) == 0; + vk_pipeline pipeline = pipelines[aligned]; + assert(pipeline); + + if (dryrun) { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return; + } + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + const uint32_t n_head_kv = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + ggml_vk_sync_buffers(subctx); + + vk_buffer d_Q, d_K, d_V, d_D, d_M; + uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset; + + bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); + ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset); + ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset); + ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset); + Q_uma = d_Q != nullptr; + K_uma = d_K != nullptr; + V_uma = d_V != nullptr; + D_uma = d_D != nullptr; + if (mask) { + ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset); + M_uma = d_M != nullptr; + } + } + + + ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context; + ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; + ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; + + if (!Q_uma) { + d_Q = q_buf_ctx->dev_buffer; + q_buf_offset = vk_tensor_offset(q) + q->view_offs; + } + if (!K_uma) { + d_K = k_buf_ctx->dev_buffer; + k_buf_offset = vk_tensor_offset(k) + k->view_offs; + } + if (!V_uma) { + d_V = v_buf_ctx->dev_buffer; + v_buf_offset = vk_tensor_offset(v) + v->view_offs; + } + if (!D_uma) { + d_D = d_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + + if (!M_uma) { + d_M = d_Q; + m_buf_offset = q_buf_offset; + if (mask) { + ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context; + d_M = m_buf_ctx->dev_buffer; + m_buf_offset = vk_tensor_offset(mask) + mask->view_offs; + } + } + + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 }); +} + static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: @@ -5044,16 +5526,16 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); @@ -5391,16 +5873,16 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_buffer_write(y_buf, 0, y, y_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -5621,7 +6103,8 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 4096, 512, 11008, 32000, 512, 4096, }; - const size_t num_it = 1; + const size_t num_it = 100; + for (size_t i = 0; i < vals.size(); i += 3) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); @@ -5676,6 +6159,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; const ggml_tensor * src2 = node->src[2]; + const ggml_tensor * src3 = node->src[3]; switch (node->op) { // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor @@ -5728,6 +6212,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_LEAKY_RELU: + case GGML_OP_FLASH_ATTN_EXT: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -5920,6 +6405,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_MUL_MAT_ID: ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); + break; + + case GGML_OP_FLASH_ATTN_EXT: + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun); + break; default: return false; @@ -6020,6 +6510,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: + case GGML_OP_FLASH_ATTN_EXT: buf = tensor->buffer; break; @@ -6751,6 +7242,57 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; } break; + case GGML_OP_FLASH_ATTN_EXT: + { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + if (!ggml_vk_get_device(ctx->device)->coopmat2) { + return false; + } + switch (op->src[0]->ne[0]) { + case 64: + case 80: + case 96: + case 112: + case 128: + case 256: + break; + default: + return false; + } + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + if (op->type != GGML_TYPE_F32) { + return false; + } + if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { + return false; + } + // It's straightforward to support different K/V dequant, but would + // significantly increase the number of pipelines + if (op->src[1]->type != op->src[2]->type) { + return false; + } + switch (op->src[1]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //case GGML_TYPE_Q2_K: + //case GGML_TYPE_Q3_K: + //case GGML_TYPE_Q4_K: + //case GGML_TYPE_Q5_K: + //case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_NL: + break; + default: + return false; + } + return true; + } case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -7065,6 +7607,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; ggml_tensor * src2 = tensor->src[2]; + ggml_tensor * src3 = tensor->src[3]; struct ggml_init_params iparams = { /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, @@ -7077,15 +7620,18 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { struct ggml_tensor * src0_clone = nullptr; struct ggml_tensor * src1_clone = nullptr; struct ggml_tensor * src2_clone = nullptr; + struct ggml_tensor * src3_clone = nullptr; struct ggml_tensor * tensor_clone = nullptr; size_t src0_size; size_t src1_size; size_t src2_size; + size_t src3_size; void * src0_buffer = nullptr; void * src1_buffer = nullptr; void * src2_buffer = nullptr; + void * src3_buffer = nullptr; if (src0 != nullptr) { src0_clone = ggml_dup_tensor(ggml_ctx, src0); @@ -7213,8 +7759,53 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_vk_print_tensor(src2, "src2"); } } + if (src3 != nullptr) { + src3_clone = ggml_dup_tensor(ggml_ctx, src3); - if (tensor->op == GGML_OP_MUL_MAT) { + src3_size = ggml_nbytes(src3); + + src3_buffer = malloc(src3_size); + src3_clone->data = src3_buffer; + if (ggml_backend_buffer_is_host(src3->buffer)) { + memcpy(src3_clone->data, src3->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(src3->buffer)) { + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src3->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src3) + src3->view_offs; + if (!ggml_is_contiguous(src3) && ggml_vk_dim01_contiguous(src3)) { + for (int i3 = 0; i3 < src3->ne[3]; i3++) { + for (int i2 = 0; i2 < src3->ne[2]; i2++) { + const int idx = i3*src3->ne[2] + i2; + ggml_vk_buffer_read(buffer_gpu, offset + idx * src3->nb[2], ((char *)src3_clone->data + idx * src3_clone->nb[2]), src3->ne[1] * src3->nb[1]); + } + } + + src3_clone->nb[0] = src3->nb[0]; + src3_clone->nb[1] = src3->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src3_clone->nb[i] = src3_clone->nb[i - 1]*src3_clone->ne[i - 1]; + } + } else { + if (offset + src3_size >= buffer_gpu->size) { + src3_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(buffer_gpu, offset, src3_clone->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ABORT("fatal error"); + } + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(src3, "src3"); + } + } + + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float *params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, src3_clone, params[0], params[1], params[2]); + } else if (tensor->op == GGML_OP_MUL_MAT) { tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_MUL_MAT_ID) { tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 51c78b7d2..bd0c74cb1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -1,7 +1,9 @@ find_package (Threads REQUIRED) +find_package(Vulkan COMPONENTS glslc REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) +target_link_libraries(vulkan-shaders-gen PRIVATE Vulkan::Vulkan) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp new file mode 100644 index 000000000..a8707b621 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -0,0 +1,305 @@ + +#include "types.comp" + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 { + block_q4_0_packed16 block; +}; + +float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1]; + qs >>= shift; + qs &= 0xF; + float16_t ret = (float16_t(qs) - float16_t(8)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 { + block_q4_1 block; +}; + +float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(qs) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 { + block_q5_0 block; +}; + +float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0]; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 { + block_q5_1 block; +}; + +float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = bl.block.qh; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = float16_t(qs | qh) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 { + block_q8_0_packed16 block; +}; + +float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + // Load 16b and select the byte for this element + int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1]; + float16_t ret = float16_t(qs) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K { + block_q2_K block; +}; + +float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const f16vec2 d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31 + const uint scalesi = iqs / 16; // 0..15 + const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6 + + uint32_t qs = bl.block.qs[qsi]; + const uint scales = bl.block.scales[scalesi]; + float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4); + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K { + block_q3_K block; +}; + +float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint hmi = (iqs % 32); // 0..31 + const uint j = (iqs % 128) / 8; // 0..15 + const uint is = iqs / 16; // 0..15 + const uint halfsplit = ((iqs % 128) / 32); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + uint32_t scaleidx0 = (is < 8) ? is : (is-8); + uint32_t scaleidx0shift = (is < 8) ? 0 : 4; + uint32_t scaleidx1 = is + 8 - (is/4)*4; + uint32_t scaleidx1shift = (is/4)*2; + + const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4)); + + const float16_t dl = bl.block.d * float16_t(us - 32); + + float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi ] >> qsshift) & 3) - (((bl.block.hmask[hmi ] & m) != 0) ? 0 : 4)); + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K { + block_q4_K block; +}; + +float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K { + block_q5_K block; +}; + +float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + const uint qhi = (iqs % 32); // 0..31 + + const uint8_t hm = uint8_t(1 << (iqs / 32)); + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K { + block_q6_K block; +}; + +float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint b = (iqs % 128) / 64; // 0,1 + const uint is_b = (iqs % 32) / 16; // 0,1 + const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 64); // 0..127 + const uint qhi = n * 32 + (iqs % 32); // 0..63 + + const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); + + float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32); + + return ret; +} + +#if defined(DATA_A_IQ4_NL) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL { + block_iq4_nl block; +}; + +float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(kvalues_iq4nl[qs]) * d; + return ret; +} +#endif + +#if defined(DATA_A_Q4_0) +#define dequantFuncA dequantFuncQ4_0 +#elif defined(DATA_A_Q4_1) +#define dequantFuncA dequantFuncQ4_1 +#elif defined(DATA_A_Q5_0) +#define dequantFuncA dequantFuncQ5_0 +#elif defined(DATA_A_Q5_1) +#define dequantFuncA dequantFuncQ5_1 +#elif defined(DATA_A_Q8_0) +#define dequantFuncA dequantFuncQ8_0 +#elif defined(DATA_A_Q2_K) +#define dequantFuncA dequantFuncQ2_K +#elif defined(DATA_A_Q3_K) +#define dequantFuncA dequantFuncQ3_K +#elif defined(DATA_A_Q4_K) +#define dequantFuncA dequantFuncQ4_K +#elif defined(DATA_A_Q5_K) +#define dequantFuncA dequantFuncQ5_K +#elif defined(DATA_A_Q6_K) +#define dequantFuncA dequantFuncQ6_K +#elif defined(DATA_A_IQ4_NL) +#define dequantFuncA dequantFuncIQ4_NL +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp new file mode 100644 index 000000000..c5be8131b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -0,0 +1,289 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable +#extension GL_EXT_null_initializer : enable + +#include "types.comp" +#include "dequant_funcs_cm2.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint32_t Br = 32; +layout (constant_id = 2) const uint32_t Bc = 32; +layout (constant_id = 3) const uint32_t D = 32; +layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV; + +layout (push_constant) uniform parameter { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +} p; + +layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; +layout (binding = 1) readonly buffer K {uint8_t data_k[];}; +layout (binding = 2) readonly buffer V {uint8_t data_v[];}; +layout (binding = 3) readonly buffer M {uint8_t data_m[];}; +layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return max(x, y); +} + +ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return x; +} + +// Replace matrix elements >= numRows or numCols with 'replace' +ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) { + if (row >= numRows || col >= numCols) { + return replace; + } + return elem; +} + +ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem) +{ + return exp(elem); +} + +ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1) +{ + return max(elem0, elem1); +} + +#if defined(BLOCK_SIZE) +#define DECODEFUNC , DEQUANTFUNC +#else +#define DECODEFUNC +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + + const uint32_t N = p.N; + const uint32_t KV = p.KV; + + const uint32_t Tr = CEIL_DIV(N, Br); + const uint32_t Tc = CEIL_DIV(KV, Bc); + + const uint32_t i = gl_WorkGroupID.x; + + const uint32_t iq2 = gl_WorkGroupID.y; + const uint32_t iq3 = gl_WorkGroupID.z; + + // broadcast factors + const uint32_t rk2 = p.neq2/p.nek2; + const uint32_t rk3 = p.neq3/p.nek3; + + const uint32_t rv2 = p.neq2/p.nev2; + const uint32_t rv3 = p.neq3/p.nev3; + + // k indices + const uint32_t ik3 = iq3 / rk3; + const uint32_t ik2 = iq2 / rk2; + + // v indices + const uint32_t iv3 = iq3 / rv3; + const uint32_t iv2 = iq2 / rv2; + + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp); + tensorLayoutNV<2, Clamp> tensorLayoutV = createTensorLayoutNV(2, Clamp); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if defined(BLOCK_SIZE) + tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE); + tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE); +#endif + + tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D); + tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); + tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); + + coopmat Q; + coopmat Qf16; + + uint32_t q_offset = iq2*p.nb02+iq3*p.nb03; + coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D)); + + Qf16 = coopmat(Q); + Qf16 *= float16_t(p.scale); + + coopmat O = coopmat(0); + + coopmat L, M; + + L = coopmat(0); + M = coopmat(-1.0/0.0); + + ACC_TYPE slope = ACC_TYPE(1.0); + + // ALiBi + if (p.max_bias > 0.0f) { + const uint32_t h = iq2; + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + + slope = pow(base, ACC_TYPE(exph)); + } + + [[dont_unroll]] + for (uint32_t j = 0; j < Tc; ++j) { + + coopmat S = coopmat(0); + + coopmat K_T; + + uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC); + S = coopMatMulAdd(Qf16, K_T, S); + + if (p.logit_softcap != 0.0f) { + [[unroll]] + for (int k = 0; k < S.length(); ++k) { + S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]); + } + } + + if (p.mask != 0) { + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + + coopmat mv; + + coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + + S += slope*coopmat(mv); + } + + // Clear padding elements to -inf, so they don't contribute to rowmax + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C); + } + + coopmat rowmax, P, rowsum, eM; + + coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce); + + coopmat Mold = M; + + // M = max(rowmax, Mold) + // P = e^(S - M) + // eM = e^(Mold - M) + coopMatPerElementNV(M, rowmax, Max, Mold); + coopMatPerElementNV(P, S - M, Exp); + coopMatPerElementNV(eM, Mold - M, Exp); + + // Clear padding elements to 0, so they don't contribute to rowsum + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C); + } + + coopmat P_A = coopmat(P); + + // compute rowsum by multiplying by matrix of all ones. + coopmat One = coopmat(1.0); + + rowsum = coopmat(0.0); + rowsum = coopMatMulAdd(P_A, One, rowsum); + + coopmat V; + uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC); + + L = eM*L + rowsum; + + // This is the "diagonal" matrix in the paper, but since we do componentwise + // multiply rather than matrix multiply it has the diagonal element smeared + // across the row + coopmat eMdiag; + + // resize eM by using smear/reduce + coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); + + O = eMdiag * O; + + O = coopMatMulAdd(P_A, V, O); + } + + coopmat Ldiag; + + // resize L by using smear/reduce + coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); + + [[unroll]] + for (int k = 0; k < Ldiag.length(); ++k) { + Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k]; + } + + O = Ldiag*O; + + tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + + // permute dimensions + tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); + uint32_t o_offset = iq3*p.ne2*p.ne1; + + coopmat O_D = coopmat(O); + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp new file mode 100644 index 000000000..cbfa5dce1 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -0,0 +1,328 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable + +#include "types.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint BM = 64; +layout (constant_id = 2) const uint BN = 64; +layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant + +layout (push_constant) uniform parameter +{ + uint M; + uint N; + uint K; + uint stride_a; + uint stride_b; + uint stride_d; + + uint batch_stride_a; + uint batch_stride_b; + uint batch_stride_d; + +#ifdef MUL_MAT_ID + uint nei0; + uint nei1; + uint nbi1; + uint ne11; +#else + uint k_split; + uint ne02; + uint ne12; + uint broadcast2; + uint broadcast3; +#endif +} p; + + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +#if QUANT_K > 1 +#define DECODEFUNCA , dequantFuncA +#define MAT_A_TYPE float16_t + +#include "dequant_funcs_cm2.comp" + +#else +#define DECODEFUNCA +#define MAT_A_TYPE A_TYPE +#endif + +#define MAT_B_TYPE B_TYPE + +#ifdef MUL_MAT_ID +layout (binding = 3) readonly buffer IDS {int data_ids[];}; + +shared u16vec4 row_ids[3072]; + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { + B_TYPE b[]; +}; + +uint _ne1; +shared uint _ne1_sh; + +B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint row_i = blockCoords[0]; + + if (row_i >= _ne1) { + return B_TYPE(0.0); + } + + const u16vec4 row_idx = row_ids[row_i]; + B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]]; + + return ret; +} + +D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic) +{ + uint dr = ir * BM + r; + uint dc = ic * BN + c; + + if (dr < p.M && dc < _ne1) { + uint row_i = dc; + const u16vec4 row_idx = row_ids[row_i]; + data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem; + } + return elem; +} + +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + +#ifdef MUL_MAT_ID + const uint expert_idx = gl_GlobalInvocationID.z; +#else + const uint batch_idx = gl_GlobalInvocationID.z; + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; +#endif + + const uint blocks_m = (p.M + BM - 1) / BM; + const uint ir = gl_WorkGroupID.x % blocks_m; + const uint ik = gl_WorkGroupID.x / blocks_m; + const uint ic = gl_WorkGroupID.y; + +#ifdef MUL_MAT_ID + // Spread the search across all elements in the first subgroup + if (gl_SubgroupID == 0) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + + for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) { + bool in_range = i < num_elements; + uint ii0 = i % p.nei0; + uint ii1 = i / p.nei0; + uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + uint idx = subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); + } + _ne1 += subgroupBallotBitCount(ballot); + } + _ne1_sh = _ne1; + } + + barrier(); + + _ne1 = _ne1_sh; + + // Workgroup has no work + if (ic * BN >= _ne1) return; +#endif + +#ifdef MUL_MAT_ID + uint start_k = 0; + const uint end_k = p.K; +#else + uint start_k = ik * p.k_split; + const uint end_k = min(p.K, (ik + 1) * p.k_split); +#endif + + coopmat sum; + sum = coopmat(0.0); + +#ifdef MUL_MAT_ID + uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K; + uint pos_b = 0; +#else + uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K; + uint pos_b = batch_idx * p.batch_stride_b; +#endif + + uint stride_a = p.stride_a / QUANT_K; + uint stride_b = p.stride_b; + + // Hint to the compiler that values are aligned (want 16B alignment). + // Quants are always block-aligned, no alignment needed. +#if ALIGNED +#if QUANT_K == 1 + stride_a &= ~7; +#endif + stride_b &= ~7; +#endif + + // Create layouts for both clamped and unclamped accesses + tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + +#if QUANT_K > 1 + tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K); + tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K); +#endif + + // Use end_k rather than p.K as the dimension because that's what + // we need to bound check against when using split_k + tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k); + tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M); + tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k); + tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if !defined(MUL_MAT_ID) + // Detect a fast path where all loads are entirely in bounds and no clamping is required + if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 && +#if QUANT_K == 1 + (stride_a % 8) == 0 && +#endif + (stride_b % 8) == 0 && (start_k % 8) == 0) { + // Hint to the compiler that values are aligned (want 16B alignment) + start_k &= ~7; + stride_b &= ~7; +#if QUANT_K == 1 + stride_a &= ~7; +#endif + + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + uint k_iters = (end_k - start_k + BK - 1) / BK; + + for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) { + + coopmat mat_a; + coopmat mat_b; + + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopmat mat_a_ft = coopmat(mat_a); + + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose); + coopmat mat_b_ft = coopmat(mat_b); + + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } else +#endif // !defined(MUL_MAT_ID) + { + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + + tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1); + + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1); + + [[dont_unroll]] + for (uint block_k = start_k; block_k < end_k; block_k += BK) { + + coopmat mat_a; + coopmat mat_b; + coopmat mat_a_ft; + coopmat mat_b_ft; + + // Clamping is expensive, so detect different code paths for each combination + // of A and B needing clamping. + bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0; +#ifdef MUL_MAT_ID + bool unclampedB = true; +#else + bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0; +#endif + if (unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } + } + + // Convert from ACC_TYPE to D_TYPE + coopmat mat_d; + mat_d = coopmat(sum); + +#ifdef MUL_MAT_ID + // Call callback to store each element, remapping row through shared memory + coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic); +#else + tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1); + + uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; + coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose); +#endif +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 5c317b68b..4716e2c80 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,6 +30,8 @@ #include #endif +#include + #define ASYNCIO_CONCURRENCY 64 std::mutex lock; @@ -196,15 +198,17 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { - std::string name = _name + (fp16 ? "" : "_fp32"); +void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { + std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); + std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + #ifdef _WIN32 - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; #else - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname}; #endif #ifdef GGML_VULKAN_SHADER_DEBUG_INFO @@ -254,7 +258,7 @@ std::map merge_maps(const std::map> compiles; -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { { // wait until fewer than N compiles are in progress. // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. @@ -265,15 +269,15 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const } compile_count++; } - compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16)); + compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); } -void matmul_shaders(bool fp16, bool matmul_id) { - std::string load_vec = fp16 ? "8" : "4"; - std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4"; - std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4"; +void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { + std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; + std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; + std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; - std::map base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}}; + std::map base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}}; std::string shader_name = "matmul"; if (matmul_id) { @@ -285,21 +289,31 @@ void matmul_shaders(bool fp16, bool matmul_id) { base_dict["FLOAT16"] = "1"; } - // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; - string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; + + // Shaders with f16 B_TYPE + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); // For unaligned, load one at a time for f32/f16, or two at a time for quants - std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; + std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2"; // For aligned matmul loads - std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; - string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); + std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2"; + + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + + if (tname != "f16" && tname != "f32") { + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + } } } @@ -307,11 +321,50 @@ void process_shaders() { std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl; std::map base_dict = {{"FLOAT_TYPE", "float"}}; + // matmul for (const auto& fp16 : {false, true}) { - matmul_shaders(fp16, false); - matmul_shaders(fp16, true); + for (const auto& matmul_id : {false, true}) { + for (const auto& coopmat2 : {false, true}) { + for (const auto& f16acc : {false, true}) { +#if !defined(VK_NV_cooperative_matrix2) + if (coopmat2) { + continue; + } +#endif + if (coopmat2 && !fp16) { + continue; + } + if (!coopmat2 && f16acc) { + continue; + } + matmul_shaders(fp16, matmul_id, coopmat2, f16acc); + } + } + } } +#if defined(VK_NV_cooperative_matrix2) + // flash attention + for (const auto& f16acc : {false, true}) { + std::string acctype = f16acc ? "float16_t" : "float"; + + for (const auto& tname : type_names) { + if (tname == "f32") { + continue; + } + + if (tname == "f16") { + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); + } else { + std::string data_a_key = "DATA_A_" + to_uppercase(tname); + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); + } + } + } +#endif + for (const auto& tname : type_names) { // mul mat vec std::string data_a_key = "DATA_A_" + to_uppercase(tname); From 7736837d62efed1dbebfe579472fca041eda12d6 Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Thu, 5 Dec 2024 23:36:41 +0200 Subject: [PATCH 279/279] fix(server) : not show alert when DONE is received (#10674) --- examples/server/public_simplechat/simplechat.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 8e0df3b61..2fcd24a86 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -407,6 +407,9 @@ class SimpleChat { if (curLine.startsWith("data:")) { curLine = curLine.substring(5); } + if (curLine.trim() === "[DONE]") { + break; + } let curJson = JSON.parse(curLine); console.debug("DBUG:SC:PART:Json:", curJson); this.append_response(this.response_extract_stream(curJson, apiEP));