From e9b4a1bf68c18beff4e33f23ea62c1245b296915 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 21 Apr 2024 00:17:47 +0000 Subject: [PATCH 01/14] flake.lock: Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/1042fd8b148a9105f3c0aca3a6177fd1d9360ba5?narHash=sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw%3D' (2024-04-10) → 'github:NixOS/nixpkgs/5c24cf2f0a12ad855f444c30b2421d044120c66f?narHash=sha256-XtTSSIB2DA6tOv%2Bl0FhvfDMiyCmhoRbNB%2B0SeInZkbk%3D' (2024-04-19) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 2e2ab4932..9c1b0af37 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1712791164, - "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=", + "lastModified": 1713537308, + "narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5", + "rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f", "type": "github" }, "original": { From c0956b09ba845a7cd787d5580d7c8b96e80f40f5 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Mon, 22 Apr 2024 13:22:54 +0200 Subject: [PATCH 02/14] ci: fix job are cancelling each other (#6781) --- .github/workflows/bench.yml | 2 +- .github/workflows/server.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 758796632..d50af0b70 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -32,7 +32,7 @@ on: - cron: '04 2 * * *' concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }} + group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }} cancel-in-progress: true jobs: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 521cc29ae..3e68a3c8c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -23,7 +23,7 @@ on: - cron: '2 4 * * *' concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.ref || github.run_id }} cancel-in-progress: true jobs: From 8960fe86ae075c846c5df8848230d1904ba8877f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 22 Apr 2024 15:41:11 +0300 Subject: [PATCH 03/14] llama : fix typo in <|im_end|> token text (#6745) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7440c740f..a25d115c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4340,7 +4340,7 @@ static void llm_load_vocab( } } - // find EOT token: "<|eot_id|>", "<|im_emd|>", "", etc. + // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc. // // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID // for now, we apply this workaround to find the EOT token based on its text @@ -4351,7 +4351,7 @@ static void llm_load_vocab( // need to fix convert script //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL && (t.first == "<|eot_id|>" || - t.first == "<|im_emd|>" || + t.first == "<|im_end|>" || t.first == "" ) ) { From e931888d5024de814ce7119a18d6a959bfff3821 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 23 Apr 2024 00:05:06 +1000 Subject: [PATCH 04/14] ggml : fix calloc argument ordering. (#6820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latest gcc complains here: /home/airlied/devel/llama.cpp/ggml-alloc.c: In function ‘ggml_gallocr_new_n’: /home/airlied/devel/llama.cpp/ggml-alloc.c:374:59: warning: ‘calloc’ sizes specified with ‘sizeof’ in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 374 | ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); | ^~~~~~ /home/airlied/devel/llama.cpp/ggml-alloc.c:374:59: note: earlier argument should specify number of elements, later size of each element and a bunch more. calloc is specified to take nmemb first then size, so realign the code. In a couple of places there was a * x, 1 so I fixed those to use calloc properly. --- ggml-alloc.c | 16 ++++++++-------- ggml-backend.c | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 7ceafec30..1fbd376ed 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -371,16 +371,16 @@ struct ggml_gallocr { }; ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { - ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); + ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); GGML_ASSERT(galloc != NULL); - galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); + galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); GGML_ASSERT(galloc->bufts != NULL); - galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); + galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs); GGML_ASSERT(galloc->buffers != NULL); - galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); + galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); GGML_ASSERT(galloc->buf_tallocs != NULL); for (int i = 0; i < n_bufs; i++) { @@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c free(galloc->hash_set.keys); free(galloc->hash_values); galloc->hash_set.size = hash_size; - galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); - galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *)); + galloc->hash_values = calloc(hash_size, sizeof(struct hash_node)); GGML_ASSERT(galloc->hash_set.keys != NULL); GGML_ASSERT(galloc->hash_values != NULL); } else { @@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // set the node_allocs from the hash table if (galloc->n_nodes < graph->n_nodes) { free(galloc->node_allocs); - galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc)); GGML_ASSERT(galloc->node_allocs != NULL); } galloc->n_nodes = graph->n_nodes; @@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } if (galloc->n_leafs < graph->n_leafs) { free(galloc->leaf_allocs); - galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs); + galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0])); GGML_ASSERT(galloc->leaf_allocs != NULL); } galloc->n_leafs = graph->n_leafs; diff --git a/ggml-backend.c b/ggml-backend.c index 189b5c140..e91d97cd9 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1725,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU - struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); - sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); - sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); + sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); + sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); - sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); + sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; const int initial_splits_capacity = 16; - sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); + sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { @@ -1972,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = { /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT + /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT }; - struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT - bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); + struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT + bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), From 192090bae47960f0d38d4967abe398a5d190057e Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Mon, 22 Apr 2024 15:00:36 -0400 Subject: [PATCH 05/14] llamafile : improve sgemm.cpp (#6796) * llamafile : improve sgemm.cpp - Re-enable by default - Fix issue described in #6716 - Make code more abstract, elegant, and maintainable - Faster handling of weirdly shaped `m` an `n` edge cases * Address review comments * Help clang produce fma instructions * Address review comments --- CMakeLists.txt | 16 +- Makefile | 4 - ggml.c | 8 +- sgemm.cpp | 945 +++++++++++++++++++++---------------------------- 4 files changed, 406 insertions(+), 567 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f134a153b..58a1805ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,17 +43,11 @@ else() set(LLAMA_METAL_DEFAULT OFF) endif() -# TODO: fix this for Android CI -# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191 -#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID") -# set(LLAMA_LLAMAFILE_DEFAULT OFF) -#else() -# set(LLAMA_LLAMAFILE_DEFAULT ON) -#endif() - -# TODO: temporary disable until MoE is fixed -# https://github.com/ggerganov/llama.cpp/pull/6716 -set(LLAMA_LLAMAFILE_DEFAULT OFF) +if (CMAKE_SYSTEM_NAME MATCHES "ANDROID") + set(LLAMA_LLAMAFILE_DEFAULT OFF) +else() + set(LLAMA_LLAMAFILE_DEFAULT ON) +endif() # general option(BUILD_SHARED_LIBS "build shared libraries" OFF) diff --git a/Makefile b/Makefile index b0b2ea997..24acb8013 100644 --- a/Makefile +++ b/Makefile @@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS MK_LDFLAGS += $(shell pkg-config --libs openblas) endif # LLAMA_OPENBLAS -# TODO: temporary disable until MoE is fixed -# https://github.com/ggerganov/llama.cpp/pull/6716 -LLAMA_NO_LLAMAFILE := 1 - ifndef LLAMA_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE OBJS += sgemm.o diff --git a/ggml.c b/ggml.c index a3b312e4a..086db96af 100644 --- a/ggml.c +++ b/ggml.c @@ -10825,7 +10825,7 @@ static void ggml_compute_forward_mul_mat( #endif #if GGML_USE_LLAMAFILE - if (nb10 == ggml_type_size(src1->type)) { + if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), @@ -10878,15 +10878,13 @@ UseGgmlGemm1:; const size_t row_size = ggml_row_size(vec_dot_type, ne10); #if GGML_USE_LLAMAFILE - if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) { + if (src1->type != vec_dot_type) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), - (const char *)wdata + ggml_row_size(vec_dot_type, - nb12/ggml_type_size(src1->type)*i12 + - nb13/ggml_type_size(src1->type)*i13), + (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), diff --git a/sgemm.cpp b/sgemm.cpp index 6900f04cf..531e12af3 100644 --- a/sgemm.cpp +++ b/sgemm.cpp @@ -50,6 +50,7 @@ #pragma GCC diagnostic ignored "-Wignored-attributes" #include "sgemm.h" +#include #include "ggml-impl.h" #include "ggml-quants.h" @@ -65,22 +66,6 @@ #define VECTOR_REGISTERS 16 #endif -// there will be blocks -#define BEGIN_KERNEL(RM, RN) \ - int ytiles = (m - m0) / RM; \ - int xtiles = (n - n0) / RN; \ - int tiles = ytiles * xtiles; \ - int duty = (tiles + nth - 1) / nth; \ - int start = duty * ith; \ - int end = start + duty; \ - if (end > tiles) \ - end = tiles; \ - for (int job = start; job < end; ++job) { \ - int i = m0 + job / xtiles * RM; \ - int j = n0 + job % xtiles * RN; - -#define END_KERNEL() } - #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) namespace { @@ -122,6 +107,45 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +//////////////////////////////////////////////////////////////////////////////////////////////////// +// VECTORIZED FUSED MULTIPLY ADD + +/** + * Computes a * b + c. + */ +template +inline U madd(T a, T b, U c) { + return add(mul(a, b), c); +} + +#if defined(__FMA__) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +template <> +inline __m256 madd(__m256 a, __m256 b, __m256 c) { + return _mm256_fmadd_ps(a, b, c); +} +#endif +#if defined(__AVX512F__) +template <> +inline __m512 madd(__m512 a, __m512 b, __m512 c) { + return _mm512_fmadd_ps(a, b, c); +} +#endif +#endif + +#if defined(__ARM_FEATURE_FMA) +template <> +inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) { + return vfmaq_f32(c, b, a); +} +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) +template <> +inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) { + return vfmaq_f16(c, b, a); +} +#endif +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // VECTORIZED HORIZONTAL SUM @@ -213,36 +237,6 @@ template <> inline __m512 load(const ggml_fp16_t *p) { } #endif // __AVX512F__ -//////////////////////////////////////////////////////////////////////////////////////////////////// -// ABSTRACTIONS - -/** - * Computes a * b + c. - * - * This operation will become fused into a single arithmetic instruction - * if the hardware has support for this feature, e.g. Intel Haswell+ (c. - * 2013), AMD Bulldozer+ (c. 2011), etc. - */ -template -inline U madd(T a, T b, U c) { - return add(mul(a, b), c); -} - -/** - * Computes a * b + c with error correction. - * - * @see W. Kahan, "Further remarks on reducing truncation errors," - * Communications of the ACM, vol. 8, no. 1, p. 40, Jan. 1965, - * doi: 10.1145/363707.363723. - */ -template -inline U madder(T a, T b, U c, U *e) { - U y = sub(mul(a, b), *e); - U t = add(c, y); - *e = sub(sub(t, c), y); - return t; -} - //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -265,226 +259,179 @@ class tinyBLAS { private: NOINLINE void mnpack(int m0, int m, int n0, int n) { int mc, nc, mp, np; - if (m - m0 <= 0 || n - n0 <= 0) - return; - if (VECTOR_REGISTERS >= 32 && n - n0 >= 5 && m - m0 >= 5) { + switch ((std::min(m - m0, 5) << 4) | std::min(n - n0, 5)) { +#if VECTOR_REGISTERS == 32 + case 0x55: mc = 5; nc = 5; - gemm5x5(m0, m, n0, n); - } else if (n - n0 >= 4 && m - m0 >= 3) { + gemm<5, 5>(m0, m, n0, n); + break; + case 0x45: + mc = 4; + nc = 5; + gemm<4, 5>(m0, m, n0, n); + break; + case 0x54: + mc = 5; + nc = 4; + gemm<5, 4>(m0, m, n0, n); + break; + case 0x44: + mc = 4; + nc = 4; + gemm<4, 4>(m0, m, n0, n); + break; + case 0x53: + mc = 5; + nc = 3; + gemm<5, 3>(m0, m, n0, n); + break; + case 0x35: + mc = 3; + nc = 5; + gemm<3, 5>(m0, m, n0, n); + break; + case 0x43: + mc = 4; + nc = 3; + gemm<4, 3>(m0, m, n0, n); + break; +#else + case 0x55: + case 0x54: + case 0x53: + case 0x45: + case 0x44: + case 0x43: + mc = 4; + nc = 3; + gemm<4, 3>(m0, m, n0, n); + break; + case 0x35: +#endif + case 0x34: mc = 3; nc = 4; - gemm3x4(m0, m, n0, n); - } else if (n - n0 >= 4) { - mc = 1; + gemm<3, 4>(m0, m, n0, n); + break; + case 0x52: + mc = 5; + nc = 2; + gemm<5, 2>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm<3, 3>(m0, m, n0, n); + break; + case 0x25: + mc = 2; + nc = 5; + gemm<2, 5>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x24: + mc = 2; nc = 4; - gemm1x4(m0, m, n0, n); - } else if (m - m0 >= 4) { + gemm<2, 4>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x51: + mc = 5; + nc = 1; + gemm<5, 1>(m0, m, n0, n); + break; + case 0x41: mc = 4; nc = 1; - gemm4x1(m0, m, n0, n); - } else { + gemm<4, 1>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x15: + mc = 1; + nc = 5; + gemm<1, 5>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm<1, 4>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: mc = 1; nc = 1; - gemm1x1(m0, m, n0, n); + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; } mp = m0 + (m - m0) / mc * mc; np = n0 + (n - n0) / nc * nc; mnpack(mp, m, n0, np); - mnpack(m0, mp, np, n); - mnpack(mp, m, np, n); + mnpack(m0, m, np, n); } - NOINLINE void gemm5x5(int m0, int m, int n0, int n) { - BEGIN_KERNEL(5, 5) - D c00 = {0}; - D c01 = {0}; - D c02 = {0}; - D c03 = {0}; - D c04 = {0}; - D c10 = {0}; - D c11 = {0}; - D c12 = {0}; - D c13 = {0}; - D c14 = {0}; - D c20 = {0}; - D c21 = {0}; - D c22 = {0}; - D c23 = {0}; - D c24 = {0}; - D c30 = {0}; - D c31 = {0}; - D c32 = {0}; - D c33 = {0}; - D c34 = {0}; - D c40 = {0}; - D c41 = {0}; - D c42 = {0}; - D c43 = {0}; - D c44 = {0}; - for (int l = 0; l < k; l += KN) { - V k0 = load(B + ldb * (j + 0) + l); - V k1 = load(B + ldb * (j + 1) + l); - V k2 = load(B + ldb * (j + 2) + l); - V k3 = load(B + ldb * (j + 3) + l); - V k4 = load(B + ldb * (j + 4) + l); - V a0 = load(A + lda * (i + 0) + l); - c00 = madd(a0, k0, c00); - c01 = madd(a0, k1, c01); - c02 = madd(a0, k2, c02); - c03 = madd(a0, k3, c03); - c04 = madd(a0, k4, c04); - V a1 = load(A + lda * (i + 1) + l); - c10 = madd(a1, k0, c10); - c11 = madd(a1, k1, c11); - c12 = madd(a1, k2, c12); - c13 = madd(a1, k3, c13); - c14 = madd(a1, k4, c14); - V a2 = load(A + lda * (i + 2) + l); - c20 = madd(a2, k0, c20); - c21 = madd(a2, k1, c21); - c22 = madd(a2, k2, c22); - c23 = madd(a2, k3, c23); - c24 = madd(a2, k4, c24); - V a3 = load(A + lda * (i + 3) + l); - c30 = madd(a3, k0, c30); - c31 = madd(a3, k1, c31); - c32 = madd(a3, k2, c32); - c33 = madd(a3, k3, c33); - c34 = madd(a3, k4, c34); - V a4 = load(A + lda * (i + 4) + l); - c40 = madd(a4, k0, c40); - c41 = madd(a4, k1, c41); - c42 = madd(a4, k2, c42); - c43 = madd(a4, k3, c43); - c44 = madd(a4, k4, c44); + template + NOINLINE void gemm(int m0, int m, int n0, int n) { + int ytiles = (m - m0) / RM; + int xtiles = (n - n0) / RN; + int tiles = xtiles * ytiles; + int duty = (tiles + nth - 1) / nth; + int start = duty * ith; + int end = start + duty; + if (end > tiles) + end = tiles; + for (int job = start; job < end; ++job) { + int ii = m0 + job / xtiles * RM; + int jj = n0 + job % xtiles * RN; + D Cv[RN][RM] = {}; + for (int l = 0; l < k; l += KN) + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + Cv[j][i] = madd(load(A + lda * (ii + i) + l), + load(B + ldb * (jj + j) + l), + Cv[j][i]); + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 0) + (i + 1)] = hsum(c10); - C[ldc * (j + 0) + (i + 2)] = hsum(c20); - C[ldc * (j + 0) + (i + 3)] = hsum(c30); - C[ldc * (j + 0) + (i + 4)] = hsum(c40); - C[ldc * (j + 1) + (i + 0)] = hsum(c01); - C[ldc * (j + 1) + (i + 1)] = hsum(c11); - C[ldc * (j + 1) + (i + 2)] = hsum(c21); - C[ldc * (j + 1) + (i + 3)] = hsum(c31); - C[ldc * (j + 1) + (i + 4)] = hsum(c41); - C[ldc * (j + 2) + (i + 0)] = hsum(c02); - C[ldc * (j + 2) + (i + 1)] = hsum(c12); - C[ldc * (j + 2) + (i + 2)] = hsum(c22); - C[ldc * (j + 2) + (i + 3)] = hsum(c32); - C[ldc * (j + 2) + (i + 4)] = hsum(c42); - C[ldc * (j + 3) + (i + 0)] = hsum(c03); - C[ldc * (j + 3) + (i + 1)] = hsum(c13); - C[ldc * (j + 3) + (i + 2)] = hsum(c23); - C[ldc * (j + 3) + (i + 3)] = hsum(c33); - C[ldc * (j + 3) + (i + 4)] = hsum(c43); - C[ldc * (j + 4) + (i + 0)] = hsum(c04); - C[ldc * (j + 4) + (i + 1)] = hsum(c14); - C[ldc * (j + 4) + (i + 2)] = hsum(c24); - C[ldc * (j + 4) + (i + 3)] = hsum(c34); - C[ldc * (j + 4) + (i + 4)] = hsum(c44); - END_KERNEL() - } - - NOINLINE void gemm3x4(int m0, int m, int n0, int n) { - BEGIN_KERNEL(3, 4) - D c00 = {0}; - D c01 = {0}; - D c02 = {0}; - D c03 = {0}; - D c10 = {0}; - D c11 = {0}; - D c12 = {0}; - D c13 = {0}; - D c20 = {0}; - D c21 = {0}; - D c22 = {0}; - D c23 = {0}; - for (int l = 0; l < k; l += KN) { - V k0 = load(B + ldb * (j + 0) + l); - V k1 = load(B + ldb * (j + 1) + l); - V k2 = load(B + ldb * (j + 2) + l); - V k3 = load(B + ldb * (j + 3) + l); - V a0 = load(A + lda * (i + 0) + l); - c00 = madd(a0, k0, c00); - c01 = madd(a0, k1, c01); - c02 = madd(a0, k2, c02); - c03 = madd(a0, k3, c03); - V a1 = load(A + lda * (i + 1) + l); - c10 = madd(a1, k0, c10); - c11 = madd(a1, k1, c11); - c12 = madd(a1, k2, c12); - c13 = madd(a1, k3, c13); - V a2 = load(A + lda * (i + 2) + l); - c20 = madd(a2, k0, c20); - c21 = madd(a2, k1, c21); - c22 = madd(a2, k2, c22); - c23 = madd(a2, k3, c23); - } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 0) + (i + 1)] = hsum(c10); - C[ldc * (j + 0) + (i + 2)] = hsum(c20); - C[ldc * (j + 1) + (i + 0)] = hsum(c01); - C[ldc * (j + 1) + (i + 1)] = hsum(c11); - C[ldc * (j + 1) + (i + 2)] = hsum(c21); - C[ldc * (j + 2) + (i + 0)] = hsum(c02); - C[ldc * (j + 2) + (i + 1)] = hsum(c12); - C[ldc * (j + 2) + (i + 2)] = hsum(c22); - C[ldc * (j + 3) + (i + 0)] = hsum(c03); - C[ldc * (j + 3) + (i + 1)] = hsum(c13); - C[ldc * (j + 3) + (i + 2)] = hsum(c23); - END_KERNEL() - } - - NOINLINE void gemm1x4(int m0, int m, int n0, int n) { - BEGIN_KERNEL(1, 4) - D c00 = {0}, e00 = {0}; - D c01 = {0}, e01 = {0}; - D c02 = {0}, e02 = {0}; - D c03 = {0}, e03 = {0}; - for (int l = 0; l < k; l += KN) { - V a = load(A + lda * (i + 0) + l); - c00 = madder(a, load(B + ldb * (j + 0) + l), c00, &e00); - c01 = madder(a, load(B + ldb * (j + 1) + l), c01, &e01); - c02 = madder(a, load(B + ldb * (j + 2) + l), c02, &e02); - c03 = madder(a, load(B + ldb * (j + 3) + l), c03, &e03); - } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 1) + (i + 0)] = hsum(c01); - C[ldc * (j + 2) + (i + 0)] = hsum(c02); - C[ldc * (j + 3) + (i + 0)] = hsum(c03); - END_KERNEL() - } - - NOINLINE void gemm4x1(int m0, int m, int n0, int n) { - BEGIN_KERNEL(4, 1) - D c00 = {0}, e00 = {0}; - D c10 = {0}, e10 = {0}; - D c20 = {0}, e20 = {0}; - D c30 = {0}, e30 = {0}; - for (int l = 0; l < k; l += KN) { - V b = load(B + ldb * (j + 0) + l); - c00 = madder(load(A + lda * (i + 0) + l), b, c00, &e00); - c10 = madder(load(A + lda * (i + 1) + l), b, c10, &e10); - c20 = madder(load(A + lda * (i + 2) + l), b, c20, &e20); - c30 = madder(load(A + lda * (i + 3) + l), b, c30, &e30); - } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 0) + (i + 1)] = hsum(c10); - C[ldc * (j + 0) + (i + 2)] = hsum(c20); - C[ldc * (j + 0) + (i + 3)] = hsum(c30); - END_KERNEL() - } - - NOINLINE void gemm1x1(int m0, int m, int n0, int n) { - BEGIN_KERNEL(1, 1) - D c = {0}, e = {0}; - for (int l = 0; l < k; l += KN) - c = madder(load(A + lda * i + l), - load(B + ldb * j + l), c, &e); - C[ldc * j + i] = hsum(c); - END_KERNEL() } const TA *const A; @@ -521,120 +468,97 @@ class tinyBLAS_Q0_ARM { private: NOINLINE void mnpack(int m0, int m, int n0, int n) { int mc, nc, mp, np; - if (m - m0 <= 0 || n - n0 <= 0) - return; - if (m - m0 >= 3 && n - n0 >= 3) { + switch ((std::min(m - m0, 3) << 4) | std::min(n - n0, 3)) { + case 0x33: mc = 3; nc = 3; - gemm3x3(m0, m, n0, n); - } else { + gemm<3, 3>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: mc = 1; nc = 1; - gemm1x1(m0, m, n0, n); + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; } mp = m0 + (m - m0) / mc * mc; np = n0 + (n - n0) / nc * nc; mnpack(mp, m, n0, np); - mnpack(m0, mp, np, n); - mnpack(mp, m, np, n); + mnpack(m0, m, np, n); } - NOINLINE void gemm3x3(int m0, int m, int n0, int n) { - BEGIN_KERNEL(3, 3) - int32x4_t zero = vdupq_n_s32(0); - float32x4_t c00 = vdupq_n_f32(0.f); - float32x4_t c01 = vdupq_n_f32(0.f); - float32x4_t c02 = vdupq_n_f32(0.f); - float32x4_t c10 = vdupq_n_f32(0.f); - float32x4_t c11 = vdupq_n_f32(0.f); - float32x4_t c12 = vdupq_n_f32(0.f); - float32x4_t c20 = vdupq_n_f32(0.f); - float32x4_t c21 = vdupq_n_f32(0.f); - float32x4_t c22 = vdupq_n_f32(0.f); - const TA *Ap0 = A + lda * (i + 0); - const TA *Ap1 = A + lda * (i + 1); - const TA *Ap2 = A + lda * (i + 2); - const block_q8_0 *Bp0 = B + ldb * (j + 0); - const block_q8_0 *Bp1 = B + ldb * (j + 1); - const block_q8_0 *Bp2 = B + ldb * (j + 2); - for (int l = 0; l < k; ++l) { - c00 = vmlaq_n_f32( - c00, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap0 + l), load_lo(Bp0 + l)), - load_hi(Ap0 + l), load_hi(Bp0 + l))), - unhalf(Ap0[l].d) * unhalf(Bp0[l].d)); - c01 = vmlaq_n_f32( - c01, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap0 + l), load_lo(Bp1 + l)), - load_hi(Ap0 + l), load_hi(Bp1 + l))), - unhalf(Ap0[l].d) * unhalf(Bp1[l].d)); - c02 = vmlaq_n_f32( - c02, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap0 + l), load_lo(Bp2 + l)), - load_hi(Ap0 + l), load_hi(Bp2 + l))), - unhalf(Ap0[l].d) * unhalf(Bp2[l].d)); - c10 = vmlaq_n_f32( - c10, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap1 + l), load_lo(Bp0 + l)), - load_hi(Ap1 + l), load_hi(Bp0 + l))), - unhalf(Ap1[l].d) * unhalf(Bp0[l].d)); - c11 = vmlaq_n_f32( - c11, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap1 + l), load_lo(Bp1 + l)), - load_hi(Ap1 + l), load_hi(Bp1 + l))), - unhalf(Ap1[l].d) * unhalf(Bp1[l].d)); - c12 = vmlaq_n_f32( - c12, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap1 + l), load_lo(Bp2 + l)), - load_hi(Ap1 + l), load_hi(Bp2 + l))), - unhalf(Ap1[l].d) * unhalf(Bp2[l].d)); - c20 = vmlaq_n_f32( - c20, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap2 + l), load_lo(Bp0 + l)), - load_hi(Ap2 + l), load_hi(Bp0 + l))), - unhalf(Ap2[l].d) * unhalf(Bp0[l].d)); - c21 = vmlaq_n_f32( - c21, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap2 + l), load_lo(Bp1 + l)), - load_hi(Ap2 + l), load_hi(Bp1 + l))), - unhalf(Ap2[l].d) * unhalf(Bp1[l].d)); - c22 = vmlaq_n_f32( - c22, - vcvtq_f32_s32(vdotq_s32(vdotq_s32(zero, load_lo(Ap2 + l), load_lo(Bp2 + l)), - load_hi(Ap2 + l), load_hi(Bp2 + l))), - unhalf(Ap2[l].d) * unhalf(Bp2[l].d)); + template + NOINLINE void gemm(int m0, int m, int n0, int n) { + int ytiles = (m - m0) / RM; + int xtiles = (n - n0) / RN; + int tiles = xtiles * ytiles; + int duty = (tiles + nth - 1) / nth; + int start = duty * ith; + int end = start + duty; + if (end > tiles) + end = tiles; + for (int job = start; job < end; ++job) { + int ii = m0 + job / xtiles * RM; + int jj = n0 + job % xtiles * RN; + float32x4_t Cv[RN][RM] = {}; + for (int l = 0; l < k; ++l) + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + Cv[j][i] = vmlaq_n_f32(Cv[j][i], + vcvtq_f32_s32(vdotq_s32( + vdotq_s32(vdupq_n_s32(0), + load_lo(A + lda * (ii + i) + l), + load_lo(B + ldb * (jj + j) + l)), + load_hi(A + lda * (ii + i) + l), + load_hi(B + ldb * (jj + j) + l))), + unhalf(A[lda * (ii + i) + l].d) * + unhalf(B[ldb * (jj + j) + l].d)); + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 0) + (i + 1)] = hsum(c10); - C[ldc * (j + 0) + (i + 2)] = hsum(c20); - C[ldc * (j + 1) + (i + 0)] = hsum(c01); - C[ldc * (j + 1) + (i + 1)] = hsum(c11); - C[ldc * (j + 1) + (i + 2)] = hsum(c21); - C[ldc * (j + 2) + (i + 0)] = hsum(c02); - C[ldc * (j + 2) + (i + 1)] = hsum(c12); - C[ldc * (j + 2) + (i + 2)] = hsum(c22); - END_KERNEL() - } - - NOINLINE void gemm1x1(int m0, int m, int n0, int n) { - BEGIN_KERNEL(1, 1) - float32x4_t acc = vdupq_n_f32(0.f); - const TA *Ap = A + lda * i; - const block_q8_0 *Bp = B + ldb * j; - for (int l = 0; l < k; ++l) { - acc = vmlaq_n_f32(acc, - vcvtq_f32_s32(vdotq_s32( - vdotq_s32(vdupq_n_s32(0), load_lo(Ap + l), load_lo(Bp + l)), - load_hi(Ap + l), load_hi(Bp + l))), - unhalf(Ap[l].d) * unhalf(Bp[l].d)); - } - C[ldc * j + i] = hsum(acc); - END_KERNEL() } inline int8x16_t load_lo(const block_q8_0 *b) { return vld1q_s8(b->qs); } + inline int8x16_t load_hi(const block_q8_0 *b) { return vld1q_s8(b->qs + 16); } @@ -644,6 +568,7 @@ class tinyBLAS_Q0_ARM { vdupq_n_u8(0x0f))), vdupq_n_s8(0x8)); } + inline int8x16_t load_hi(const block_q4_0 *b) { return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)), vdupq_n_s8(0x8)); @@ -679,217 +604,143 @@ class tinyBLAS_Q0_AVX2 { } private: - NOINLINE void mnpack(int m0, int m, int n0, int n) { + void mnpack(int m0, int m, int n0, int n) { int mc, nc, mp, np; - if (m - m0 <= 0 || n - n0 <= 0) - return; - if (m - m0 >= 4 && n - n0 >= 3) { + switch ((std::min(m - m0, 4) << 4) | std::min(n - n0, 4)) { +#if VECTOR_REGISTERS == 32 + case 0x44: + mc = 4; + nc = 4; + gemm<4, 4>(m0, m, n0, n); + break; + case 0x43: mc = 4; nc = 3; - gemm4x3(m0, m, n0, n); - } else if (m - m0 >= 4 && n - n0 >= 1) { + gemm<4, 3>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm<3, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm<2, 4>(m0, m, n0, n); + break; +#else + case 0x44: + case 0x43: + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x34: + case 0x24: + mc = 2; + nc = 4; + gemm<2, 4>(m0, m, n0, n); + break; + case 0x33: +#endif + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x41: mc = 4; nc = 1; - gemm4x1(m0, m, n0, n); - } else if (m - m0 >= 1 && n - n0 >= 4) { + gemm<4, 1>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x14: mc = 1; nc = 4; - gemm1x4(m0, m, n0, n); - } else { + gemm<1, 4>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: mc = 1; nc = 1; - gemm1x1(m0, m, n0, n); + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; } mp = m0 + (m - m0) / mc * mc; np = n0 + (n - n0) / nc * nc; mnpack(mp, m, n0, np); - mnpack(m0, mp, np, n); - mnpack(mp, m, np, n); + mnpack(m0, m, np, n); } - NOINLINE void gemm4x3(int m0, int m, int n0, int n) { - BEGIN_KERNEL(4, 3) - __m256 c00 = _mm256_setzero_ps(); - __m256 c10 = _mm256_setzero_ps(); - __m256 c20 = _mm256_setzero_ps(); - __m256 c30 = _mm256_setzero_ps(); - __m256 c01 = _mm256_setzero_ps(); - __m256 c11 = _mm256_setzero_ps(); - __m256 c21 = _mm256_setzero_ps(); - __m256 c31 = _mm256_setzero_ps(); - __m256 c02 = _mm256_setzero_ps(); - __m256 c12 = _mm256_setzero_ps(); - __m256 c22 = _mm256_setzero_ps(); - __m256 c32 = _mm256_setzero_ps(); - const TA *Ap0 = A + lda * (i + 0); - const TA *Ap1 = A + lda * (i + 1); - const TA *Ap2 = A + lda * (i + 2); - const TA *Ap3 = A + lda * (i + 3); - const TB *Bp0 = B + ldb * (j + 0); - const TB *Bp1 = B + ldb * (j + 1); - const TB *Bp2 = B + ldb * (j + 2); - for (int l = 0; l < k; ++l) { - float da0 = unhalf(Ap0[l].d); - float da1 = unhalf(Ap1[l].d); - float da2 = unhalf(Ap2[l].d); - float da3 = unhalf(Ap3[l].d); - __m256i e0 = load(Ap0 + l); - __m256i e1 = load(Ap1 + l); - __m256i e2 = load(Ap2 + l); - __m256i e3 = load(Ap3 + l); - float db0 = unhalf(Bp0[l].d); - __m256 d00 = _mm256_set1_ps(da0 * db0); - __m256 d10 = _mm256_set1_ps(da1 * db0); - __m256 d20 = _mm256_set1_ps(da2 * db0); - __m256 d30 = _mm256_set1_ps(da3 * db0); - __m256i f0 = load(Bp0 + l); - __m256i u0 = _mm256_sign_epi8(f0, f0); - __m256i s00 = _mm256_sign_epi8(e0, f0); - __m256i s10 = _mm256_sign_epi8(e1, f0); - __m256i s20 = _mm256_sign_epi8(e2, f0); - __m256i s30 = _mm256_sign_epi8(e3, f0); - c00 = madd(d00, updot(u0, s00), c00); - c10 = madd(d10, updot(u0, s10), c10); - c20 = madd(d20, updot(u0, s20), c20); - c30 = madd(d30, updot(u0, s30), c30); - float db1 = unhalf(Bp1[l].d); - __m256 d01 = _mm256_set1_ps(da0 * db1); - __m256 d11 = _mm256_set1_ps(da1 * db1); - __m256 d21 = _mm256_set1_ps(da2 * db1); - __m256 d31 = _mm256_set1_ps(da3 * db1); - __m256i f1 = load(Bp1 + l); - __m256i u1 = _mm256_sign_epi8(f1, f1); - __m256i s01 = _mm256_sign_epi8(e0, f1); - __m256i s11 = _mm256_sign_epi8(e1, f1); - __m256i s21 = _mm256_sign_epi8(e2, f1); - __m256i s31 = _mm256_sign_epi8(e3, f1); - c01 = madd(d01, updot(u1, s01), c01); - c11 = madd(d11, updot(u1, s11), c11); - c21 = madd(d21, updot(u1, s21), c21); - c31 = madd(d31, updot(u1, s31), c31); - float db2 = unhalf(Bp2[l].d); - __m256 d02 = _mm256_set1_ps(da0 * db2); - __m256 d12 = _mm256_set1_ps(da1 * db2); - __m256 d22 = _mm256_set1_ps(da2 * db2); - __m256 d32 = _mm256_set1_ps(da3 * db2); - __m256i f2 = load(Bp2 + l); - __m256i u2 = _mm256_sign_epi8(f2, f2); - __m256i s02 = _mm256_sign_epi8(e0, f2); - __m256i s12 = _mm256_sign_epi8(e1, f2); - __m256i s22 = _mm256_sign_epi8(e2, f2); - __m256i s32 = _mm256_sign_epi8(e3, f2); - c02 = madd(d02, updot(u2, s02), c02); - c12 = madd(d12, updot(u2, s12), c12); - c22 = madd(d22, updot(u2, s22), c22); - c32 = madd(d32, updot(u2, s32), c32); + template + NOINLINE void gemm(int m0, int m, int n0, int n) { + int ytiles = (m - m0) / RM; + int xtiles = (n - n0) / RN; + int tiles = xtiles * ytiles; + int duty = (tiles + nth - 1) / nth; + int start = duty * ith; + int end = start + duty; + if (end > tiles) + end = tiles; + for (int job = start; job < end; ++job) { + int ii = m0 + job / xtiles * RM; + int jj = n0 + job % xtiles * RN; + __m256 Cv[RN][RM] = {}; + for (int l = 0; l < k; ++l) + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) * + unhalf(B[ldb * (jj + j) + l].d)), + updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l), + load(A + lda * (ii + i) + l)), + _mm256_sign_epi8(load(B + ldb * (jj + j) + l), + load(A + lda * (ii + i) + l))), + Cv[j][i]); + for (int j = 0; j < RN; ++j) + for (int i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } - C[ldc * (j + 0) + (i + 0)] = hsum(c00); - C[ldc * (j + 0) + (i + 1)] = hsum(c10); - C[ldc * (j + 0) + (i + 2)] = hsum(c20); - C[ldc * (j + 0) + (i + 3)] = hsum(c30); - C[ldc * (j + 1) + (i + 0)] = hsum(c01); - C[ldc * (j + 1) + (i + 1)] = hsum(c11); - C[ldc * (j + 1) + (i + 2)] = hsum(c21); - C[ldc * (j + 1) + (i + 3)] = hsum(c31); - C[ldc * (j + 2) + (i + 0)] = hsum(c02); - C[ldc * (j + 2) + (i + 1)] = hsum(c12); - C[ldc * (j + 2) + (i + 2)] = hsum(c22); - C[ldc * (j + 2) + (i + 3)] = hsum(c32); - END_KERNEL() - } - - NOINLINE void gemm4x1(int m0, int m, int n0, int n) { - BEGIN_KERNEL(4, 1) - __m256 c0 = _mm256_setzero_ps(); - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - const TA *Ap0 = A + lda * (i + 0); - const TA *Ap1 = A + lda * (i + 1); - const TA *Ap2 = A + lda * (i + 2); - const TA *Ap3 = A + lda * (i + 3); - const TB *Bp = B + ldb * j; - for (int l = 0; l < k; ++l) { - float db0 = unhalf(Bp[l].d); - __m256i f = load(Bp + l); - __m256i u = _mm256_sign_epi8(f, f); - __m256 d0 = _mm256_set1_ps(unhalf(Ap0[l].d) * db0); - __m256 d1 = _mm256_set1_ps(unhalf(Ap1[l].d) * db0); - __m256 d2 = _mm256_set1_ps(unhalf(Ap2[l].d) * db0); - __m256 d3 = _mm256_set1_ps(unhalf(Ap3[l].d) * db0); - __m256i e0 = load(Ap0 + l); - __m256i e1 = load(Ap1 + l); - __m256i e2 = load(Ap2 + l); - __m256i e3 = load(Ap3 + l); - __m256i s0 = _mm256_sign_epi8(e0, f); - __m256i s1 = _mm256_sign_epi8(e1, f); - __m256i s2 = _mm256_sign_epi8(e2, f); - __m256i s3 = _mm256_sign_epi8(e3, f); - __m256 g0 = updot(u, s0); - __m256 g1 = updot(u, s1); - __m256 g2 = updot(u, s2); - __m256 g3 = updot(u, s3); - c0 = madd(d0, g0, c0); - c1 = madd(d1, g1, c1); - c2 = madd(d2, g2, c2); - c3 = madd(d3, g3, c3); - } - C[ldc * j + (i + 0)] = hsum(c0); - C[ldc * j + (i + 1)] = hsum(c1); - C[ldc * j + (i + 2)] = hsum(c2); - C[ldc * j + (i + 3)] = hsum(c3); - END_KERNEL() - } - - NOINLINE void gemm1x4(int m0, int m, int n0, int n) { - BEGIN_KERNEL(1, 4) - __m256 c0 = _mm256_setzero_ps(); - __m256 c1 = _mm256_setzero_ps(); - __m256 c2 = _mm256_setzero_ps(); - __m256 c3 = _mm256_setzero_ps(); - const TB *Bp0 = B + ldb * (j + 0); - const TB *Bp1 = B + ldb * (j + 1); - const TB *Bp2 = B + ldb * (j + 2); - const TB *Bp3 = B + ldb * (j + 3); - const TA *Ap = A + lda * i; - for (int l = 0; l < k; ++l) { - float da0 = unhalf(Ap[l].d); - __m256i f = load(Ap + l); - __m256i u = _mm256_sign_epi8(f, f); - __m256 d0 = _mm256_set1_ps(unhalf(Bp0[l].d) * da0); - __m256 d1 = _mm256_set1_ps(unhalf(Bp1[l].d) * da0); - __m256 d2 = _mm256_set1_ps(unhalf(Bp2[l].d) * da0); - __m256 d3 = _mm256_set1_ps(unhalf(Bp3[l].d) * da0); - __m256 g0 = updot(u, _mm256_sign_epi8(load(Bp0 + l), f)); - __m256 g1 = updot(u, _mm256_sign_epi8(load(Bp1 + l), f)); - __m256 g2 = updot(u, _mm256_sign_epi8(load(Bp2 + l), f)); - __m256 g3 = updot(u, _mm256_sign_epi8(load(Bp3 + l), f)); - c0 = madd(d0, g0, c0); - c1 = madd(d1, g1, c1); - c2 = madd(d2, g2, c2); - c3 = madd(d3, g3, c3); - } - C[ldc * (j + 0) + i] = hsum(c0); - C[ldc * (j + 1) + i] = hsum(c1); - C[ldc * (j + 2) + i] = hsum(c2); - C[ldc * (j + 3) + i] = hsum(c3); - END_KERNEL() - } - - NOINLINE void gemm1x1(int m0, int m, int n0, int n) { - BEGIN_KERNEL(1, 1) - __m256 c = _mm256_setzero_ps(); - const TA *Ap = A + lda * i; - const TB *Bp = B + ldb * j; - for (int l = 0; l < k; ++l) { - __m256 d = _mm256_set1_ps(unhalf(Ap[l].d) * unhalf(Bp[l].d)); - __m256i e = load(Ap + l); - __m256i f = load(Bp + l); - __m256 g = updot(_mm256_sign_epi8(e, e), _mm256_sign_epi8(f, e)); - c = madd(d, g, c); - } - C[ldc * j + i] = hsum(c); - END_KERNEL() } inline __m256i load(const block_q8_0 *b) { @@ -911,10 +762,10 @@ class tinyBLAS_Q0_AVX2 { } static inline __m256i denibble(const uint8_t *p) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)p); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8(15); - return _mm256_and_si256(lowMask, bytes); + __m128i x = _mm_loadu_si128((const __m128i *)p); + return _mm256_and_si256(_mm256_set1_epi8(15), + _mm256_insertf128_si256(_mm256_castsi128_si256(x), + _mm_srli_epi16(x, 4), 1)); } const TA *const A; From 4e96a812b3ce7322a29a3008db2ed73d9087b176 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Tue, 23 Apr 2024 02:53:18 +0200 Subject: [PATCH 06/14] [SYCL] Windows default build instructions without -DLLAMA_SYCL_F16 flag activated (#6767) * Fix FP32/FP16 build instructions * Fix typo * Recommended build instruction Co-authored-by: Neo Zhang Jianyu * Recommended build instruction Co-authored-by: Neo Zhang Jianyu * Recommended build instruction Co-authored-by: Neo Zhang Jianyu * Add comments in Intel GPU linux --------- Co-authored-by: Anas Ahouzi <112881240+aahouzi-intel@users.noreply.github.com> Co-authored-by: Neo Zhang Jianyu --- README-sycl.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/README-sycl.md b/README-sycl.md index 2aa465070..dc98c7b3e 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh # Build LLAMA with MKL BLAS acceleration for intel GPU mkdir -p build && cd build -# Option 1: Use FP16 for better performance in long-prompt inference -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON - -# Option 2: Use FP32 by default +# Option 1: Use FP32 (recommended for better performance in most cases) cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Option 2: Use FP16 +cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + #build all binary cmake --build . --config Release -j -v ``` @@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL mkdir -p build && cd build -# Option 1: Use FP16 for better performance in long-prompt inference -cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON - -# Option 2: Use FP32 by default +# Option 1: Use FP32 (recommended for better performance in most cases) cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Option 2: Use FP16 +cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + #build all binary cmake --build . --config Release -j -v @@ -416,6 +416,10 @@ mkdir -p build cd build @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force +# Option 1: Use FP32 (recommended for better performance in most cases) +cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release + +# Option 2: Or FP16 cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON make -j From c8297c6af5693555652c40b95974b95d49d2674d Mon Sep 17 00:00:00 2001 From: liuwei-git <14815172+liuwei-git@users.noreply.github.com> Date: Wed, 24 Apr 2024 15:00:37 +0800 Subject: [PATCH 07/14] llama : add phi3 support (#6852) * add explicit phi3 support * add explicit phi3 support * remove unused code * convert : add BOS token * llama : match EOT token <|end|> * llama : minor / style * llama : tabs -> spaces * convert : fix lint checks --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 85 +++++++++++++++ gguf-py/gguf/constants.py | 16 +++ gguf-py/gguf/tensor_mapping.py | 2 + llama.cpp | 192 ++++++++++++++++++++++++++++++++- 4 files changed, 294 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4fd916cba..4ace13eb6 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1979,6 +1979,91 @@ class Phi2Model(Model): self.gguf_writer.add_add_bos_token(False) +@Model.register("Phi3ForCausalLM") +class Phi3MiniModel(Model): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = 1.0 + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + + self.gguf_writer.add_name("Phi3") + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(8192) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + + @Model.register("PlamoForCausalLM") class PlamoModel(Model): model_arch = gguf.MODEL_ARCH.PLAMO diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 06cb26a7d..d2f1de198 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -124,6 +124,7 @@ class MODEL_ARCH(IntEnum): QWEN2 = auto() QWEN2MOE = auto() PHI2 = auto() + PHI3 = auto() PLAMO = auto() CODESHELL = auto() ORION = auto() @@ -200,6 +201,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", @@ -550,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.PHI3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.CODESHELL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.POS_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 10de36fa8..e5750d419 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -117,6 +117,7 @@ class TensorNameMap: "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.qkv_proj" # phi3 ), # Attention query @@ -234,6 +235,7 @@ class TensorNameMap: "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert diff --git a/llama.cpp b/llama.cpp index a25d115c1..30fe19037 100644 --- a/llama.cpp +++ b/llama.cpp @@ -211,6 +211,7 @@ enum llm_arch { LLM_ARCH_QWEN2, LLM_ARCH_QWEN2MOE, LLM_ARCH_PHI2, + LLM_ARCH_PHI3, LLM_ARCH_PLAMO, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, @@ -246,6 +247,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, @@ -793,6 +795,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PHI3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_PLAMO, { @@ -3955,6 +3974,16 @@ static void llm_load_hparams( { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_PHI3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; @@ -4352,6 +4381,7 @@ static void llm_load_vocab( //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL && (t.first == "<|eot_id|>" || t.first == "<|im_end|>" || + t.first == "<|end|>" || t.first == "" ) ) { @@ -5375,6 +5405,33 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; + case LLM_ARCH_PHI3: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context* ctx_layer = ctx_for_layer(i); + ggml_context* ctx_split = ctx_for_layer_split(i); + + auto& layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }); + } + } break; case LLM_ARCH_PLAMO: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -6326,7 +6383,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 ggml_mul_mat_set_prec(kq, GGML_PREC_F32); @@ -8967,12 +9024,140 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); + return gf; + } + + struct ggml_cgraph * build_phi3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; + + // self-attention + { + struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(attn_norm_output, "attn_norm", il); + + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + } + else { + Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, residual); + residual = cur; + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // FF + // special-case: the up and gate tensors are merged into a single tensor + // TOOD: support into llm_build_ffn + { + struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); + cb(up, "ffn_up", il); + + auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0)); + auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2)); + + y = ggml_mul(ctx0, y, ggml_silu(ctx0, g)); + cb(y, "ffn_gate", il); + + auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y); + cb(down, "ffn_down", il); + + cur = down; + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, residual, cur); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } + struct ggml_cgraph * build_plamo() { struct ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -10474,6 +10659,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_phi2(); } break; + case LLM_ARCH_PHI3: + { + result = llm.build_phi3(); + } break; case LLM_ARCH_PLAMO: { result = llm.build_plamo(); @@ -15393,6 +15582,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: case LLM_ARCH_PHI2: + case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: case LLM_ARCH_STARCODER2: return LLAMA_ROPE_TYPE_NEOX; From 3fec68be4e9577fc53158366d3b3af039c17bb1f Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 24 Apr 2024 15:16:21 +0800 Subject: [PATCH 08/14] convert : add support of codeqwen due to tokenizer (#6707) * add support of codeqwen due to tokenizer * override load_hparams * fix typo * fix load_params * convert : fix whitespace --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4ace13eb6..5763b6664 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -363,6 +363,16 @@ class Model(ABC): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + print( + f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" + ) + for i in range(1, pad_count + 1): + tokens.append(f"[PAD{i}]") + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + assert len(tokens) == vocab_size self.gguf_writer.add_tokenizer_model("llama") @@ -1789,6 +1799,12 @@ class QwenModel(Model): class Qwen2Model(Model): model_arch = gguf.MODEL_ARCH.QWEN2 + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): From abd3314064cd3c513f9eef34c3ba6c23a107442c Mon Sep 17 00:00:00 2001 From: Tristan Druyen Date: Wed, 24 Apr 2024 10:52:37 +0200 Subject: [PATCH 09/14] llama : add phi 3 chat template (#6857) * Add phi 3 chat template & tests * test : fix chat template result --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 9 +++++++++ tests/test-chat-template.cpp | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/llama.cpp b/llama.cpp index 30fe19037..e4ca34bd1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17447,6 +17447,15 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } + } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) { + // Phi 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index cddf86a41..4fe9183b9 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -49,6 +49,8 @@ int main(void) { "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", // Llama-3 "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + // Phi-3 + "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}" }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -77,6 +79,8 @@ int main(void) { "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", // Llama 3 "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + // Phi 3 + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", }; std::vector formatted_chat(1024); int32_t res; From c0d1b3e03e27634ac2871761f5033cf9324d472d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 Apr 2024 12:00:07 +0300 Subject: [PATCH 10/14] ggml : move 32-bit arm compat in ggml-impl.h (#6865) ggml-ci --- ggml-impl.h | 260 ++++++++++++++++++++++++++++++++++++++++++++- ggml-quants.c | 287 -------------------------------------------------- 2 files changed, 256 insertions(+), 291 deletions(-) diff --git a/ggml-impl.h b/ggml-impl.h index 0c997d3ed..2ffacc299 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -45,7 +45,7 @@ extern "C" { // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t -#if defined(__ARM_NEON) && !defined(_MSC_VER) +#if defined(__ARM_NEON) // if YCM cannot find , make a symbolic link to it, for example: // @@ -53,8 +53,262 @@ extern "C" { // #include +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + typedef __fp16 ggml_fp16_internal_t; +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_int16x8x2_t { + int16x8_t val[2]; +} ggml_int16x8x2_t; + +inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { + ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_uint8x16x2_t; + +inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { + ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_uint8x16x4_t; + +inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { + ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_int8x16x2_t { + int8x16_t val[2]; +} ggml_int8x16x2_t; + +inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { + ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_int8x16x4_t { + int8x16_t val[4]; +} ggml_int8x16x4_t; + +inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +// NOTE: not tested +inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + uint8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +#else + +#define ggml_int16x8x2_t int16x8x2_t +#define ggml_uint8x16x2_t uint8x16x2_t +#define ggml_uint8x16x4_t uint8x16x4_t +#define ggml_int8x16x2_t int8x16x2_t +#define ggml_int8x16x4_t int8x16x4_t + +#define ggml_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 + +#endif // !defined(__aarch64__) + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + +#endif // defined(__ARM_NEON) + +#if defined(__ARM_NEON) && !defined(__MSC_VER) + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) @@ -75,8 +329,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #else -typedef uint16_t ggml_fp16_internal_t; - #ifdef __wasm_simd128__ #include #else @@ -221,7 +473,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #endif // __F16C__ -#endif // __ARM_NEON +#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() diff --git a/ggml-quants.c b/ggml-quants.c index 32360a1f1..11e11c219 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -14,41 +14,6 @@ #include // for qsort #include // for GGML_ASSERT -#ifdef __ARM_NEON - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#else - -#ifdef __wasm_simd128__ -#include -#else -#if defined(__POWER9_VECTOR__) || defined(__powerpc64__) -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - #undef MIN #undef MAX @@ -276,258 +241,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -#if defined(__ARM_NEON) - -#ifdef _MSC_VER - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif - -#if !defined(__aarch64__) - -// 64-bit compatibility - -// vaddvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif - -#endif - #if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) From 28103f4832e301a9c84d44ff0df9d75d46ab6c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 24 Apr 2024 11:08:36 +0200 Subject: [PATCH 11/14] Server: fix seed for multiple slots (#6835) * Server: add tests for consistent results * sampling: separate rng per sampling context --- common/common.cpp | 2 + common/sampling.cpp | 13 ++++- common/sampling.h | 47 ++++++++------- examples/lookup/lookup-stats.cpp | 1 - examples/lookup/lookup.cpp | 1 - examples/main/main.cpp | 1 - examples/server/server.cpp | 3 +- .../server/tests/features/results.feature | 57 +++++++++++++++++++ examples/server/tests/features/steps/steps.py | 34 +++++++++++ llama.cpp | 7 ++- llama.h | 9 ++- 11 files changed, 145 insertions(+), 30 deletions(-) create mode 100644 examples/server/tests/features/results.feature diff --git a/common/common.cpp b/common/common.cpp index 06f252ea6..a0d1f8d59 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -242,7 +242,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } + // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); + sparams.seed = std::stoul(argv[i]); return true; } if (arg == "-t" || arg == "--threads") { diff --git a/common/sampling.cpp b/common/sampling.cpp index 45d68b26c..f24665501 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,4 +1,6 @@ +#define LLAMA_API_INTERNAL #include "sampling.h" +#include struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); @@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ result->prev.resize(params.n_prev); + llama_sampling_set_rng_seed(result, params.seed); + return result; } @@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) { ctx->cur.clear(); } +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { if (dst->grammar) { llama_grammar_free(dst->grammar); @@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl( sampler_queue(ctx_main, params, cur_p, min_keep); - id = llama_sample_token(ctx_main, &cur_p); + id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng); //{ // const int n_top = 10; diff --git a/common/sampling.h b/common/sampling.h index 639b819ab..cf7081e36 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -4,9 +4,10 @@ #include "grammar-parser.h" +#include #include -#include #include +#include // sampler types enum class llama_sampler_type : char { @@ -20,25 +21,26 @@ enum class llama_sampler_type : char { // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context std::vector samplers_sequence = { llama_sampler_type::TOP_K, @@ -79,6 +81,8 @@ struct llama_sampling_context { // TODO: replace with ring-buffer std::vector prev; std::vector cur; + + std::mt19937 rng; }; #include "common.h" @@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx); // - reset grammar void llama_sampling_reset(llama_sampling_context * ctx); +// Set the sampler seed +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed); + // Copy the sampler context void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 41b62c2fe..87ecc0a4f 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -30,7 +30,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - llama_set_rng_seed(ctx, params.seed); GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 9526e898f..eebbd00a5 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -38,7 +38,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - llama_set_rng_seed(ctx, params.seed); GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1180734b9..a74d4d9c7 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -240,7 +240,6 @@ int main(int argc, char ** argv) { return 1; } session_tokens.resize(n_token_count_out); - llama_set_rng_seed(ctx, params.seed); LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 25bc29639..68c63f9f1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -854,7 +854,7 @@ struct server_context { slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); - slot.params.seed = json_value(data, "seed", default_params.seed); + slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -1028,7 +1028,6 @@ struct server_context { send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); return false; } - llama_set_rng_seed(ctx, slot.params.seed); } slot.command = SLOT_COMMAND_LOAD_PROMPT; diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature new file mode 100644 index 000000000..f17120f7b --- /dev/null +++ b/examples/server/tests/features/results.feature @@ -0,0 +1,57 @@ +@llama.cpp +@results +Feature: Results + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models + And a model file test-model-00001-of-00003.gguf + And 128 as batch size + And 256 KV cache size + And 128 max tokens to predict + + Scenario Outline: Multi users completion + Given slots + And continuous batching + Then the server is starting + Then the server is healthy + + Given 42 as seed + And a prompt: + """ + Write a very long story about AI. + """ + + Given 42 as seed + And a prompt: + """ + Write a very long story about AI. + """ + + Given 42 as seed + And a prompt: + """ + Write a very long story about AI. + """ + + Given 42 as seed + And a prompt: + """ + Write a very long story about AI. + """ + + Given 42 as seed + And a prompt: + """ + Write a very long story about AI. + """ + + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all predictions are equal + Examples: + | n_slots | + | 1 | + | 2 | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index ca400efa4..f71e0d706 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port): context.server_metrics = False context.server_process = None context.seed = None + context.draft = None context.server_seed = None context.user_api_key = None context.response_format = None @@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl): context.n_gpu_layer = ngl +@step('{draft:d} as draft') +def step_draft(context, draft): + context.draft = draft + + @step('{n_ctx:d} KV cache size') def step_n_ctx(context, n_ctx): context.n_ctx = n_ctx @@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n): assert_n_tokens_predicted(context.completion, predicted_n) +@step('all predictions are equal') +@async_run_until_complete +async def step_predictions_equal(context): + n_completions = await gather_tasks_results(context) + assert n_completions >= 2, "need at least 2 completions" + assert_all_predictions_equal(context.tasks_result) + context.tasks_result = [] + + @step('the completion is truncated') def step_assert_completion_truncated(context): step_assert_completion_truncated(context, '') @@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' f' {n_predicted} <> {expected_predicted_n}') +def assert_all_predictions_equal(completion_responses): + content_0 = completion_responses[0]['content'] + + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + print(f"content 0: {content_0}") + + i = 1 + for response in completion_responses[1:]: + content = response['content'] + + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + print(f"content {i}: {content}") + + assert content == content_0, "contents not equal" + + i += 1 + async def gather_tasks_results(context): n_tasks = len(context.concurrent_tasks) @@ -1148,6 +1180,8 @@ def start_server_background(context): server_args.extend(['--ubatch-size', context.n_ubatch]) if context.n_gpu_layer: server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) + if context.draft is not None: + server_args.extend(['--draft', context.draft]) if context.server_continuous_batching: server_args.append('--cont-batching') if context.server_embeddings: diff --git a/llama.cpp b/llama.cpp index e4ca34bd1..3a4a03d8f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13667,7 +13667,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da return result; } -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { +llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) { GGML_ASSERT(ctx); const int64_t t_start_sample_us = ggml_time_us(); @@ -13680,7 +13680,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra } std::discrete_distribution<> dist(probs.begin(), probs.end()); - auto & rng = ctx->rng; int idx = dist(rng); llama_token result = candidates->data[idx].id; @@ -13690,6 +13689,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra return result; } +llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { + return llama_sample_token_with_rng(ctx, candidates, ctx->rng); +} + void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = ggml_time_us(); diff --git a/llama.h b/llama.h index 4effca42c..7bfd13740 100644 --- a/llama.h +++ b/llama.h @@ -987,7 +987,7 @@ extern "C" { struct llama_context * ctx, llama_token_data_array * candidates); - /// @details Randomly selects a token from the candidates based on their probabilities. + /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. LLAMA_API llama_token llama_sample_token( struct llama_context * ctx, llama_token_data_array * candidates); @@ -1074,8 +1074,9 @@ extern "C" { // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL -#include +#include #include +#include struct ggml_tensor; @@ -1112,6 +1113,10 @@ std::pair, llama_partial_utf8> decode_utf8( const std::string & src, llama_partial_utf8 partial_start); +// Randomly selects a token from the candidates based on their probabilities using given std::mt19937. +// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences. +llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng); + #endif // LLAMA_API_INTERNAL #endif // LLAMA_H From 37246b1031b1680c0dcaf20aef736d6b446203fa Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Wed, 24 Apr 2024 05:15:29 -0500 Subject: [PATCH 12/14] common : revert showing control tokens by default for server (#6860) * fix: revert showing control tokens by default * feat: revert changes to default behavior of llama_token_to_piece; provide overridden declaration to receive "bool special" param to toggle showing control tokens * feat: use the overridden declaration of llama_token_to_piece from common/common.cpp to specify "false" so that control tokens are not shown in chat completion responses" * common : simplify --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 6 +++--- common/common.h | 5 +++-- examples/server/server.cpp | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a0d1f8d59..97f55b053 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2328,12 +2328,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); diff --git a/common/common.h b/common/common.h index cca44268e..157b54a3e 100644 --- a/common/common.h +++ b/common/common.h @@ -237,11 +237,12 @@ std::vector llama_tokenize( bool add_special, bool parse_special = false); -// tokenizes a token into a piece +// tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` std::string llama_token_to_piece( const struct llama_context * ctx, - llama_token token); + llama_token token, + bool special = true); // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function // that takes into account the tokenizer type and decides how to handle the leading space diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 68c63f9f1..3acbd17df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1117,7 +1117,7 @@ struct server_context { bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); + const std::string token_str = llama_token_to_piece(ctx, result.tok, false); slot.sampled = result.tok; // search stop word and delete it From 3fe847b5747676ee1bf90371c46ed0bc66b57240 Mon Sep 17 00:00:00 2001 From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:54:24 +0200 Subject: [PATCH 13/14] server : do not apply Markdown formatting in code sections (#6850) --- examples/server/public/index.html | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 9fe61eb1b..2961999f2 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -881,11 +881,11 @@ .replace(/&/g, '&') .replace(//g, '>') - .replace(/^#{1,6} (.*)$/gim, '

$1

') - .replace(/\*\*(.*?)\*\*/g, '$1') - .replace(/__(.*?)__/g, '$1') - .replace(/\*(.*?)\*/g, '$1') - .replace(/_(.*?)_/g, '$1') + .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1

$2

') + .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1') + .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1') + .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1') + .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1') .replace(/```.*?\n([\s\S]*?)```/g, '
$1
') .replace(/`(.*?)`/g, '$1') .replace(/\n/gim, '
'); From b4e4b8a9351d918a56831c73cf9f25c1837b80d1 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Wed, 24 Apr 2024 08:10:07 -0500 Subject: [PATCH 14/14] llama : add llama_get_pooling_type function (#6862) * add llama_get_pooling_type function * fix argument name, move with ctx funcs --- common/common.h | 4 ++-- llama.cpp | 4 ++++ llama.h | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/common/common.h b/common/common.h index 157b54a3e..87361e8e9 100644 --- a/common/common.h +++ b/common/common.h @@ -86,8 +86,8 @@ struct gpt_params { ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; - llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings + enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings // // sampling parameters struct llama_sampling_params sparams; diff --git a/llama.cpp b/llama.cpp index 3a4a03d8f..3a84b4916 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15599,6 +15599,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { return LLAMA_ROPE_TYPE_NONE; } +enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { + return ctx->cparams.pooling_type; +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->hparams.n_vocab; } diff --git a/llama.h b/llama.h index 7bfd13740..0eb2a1e9a 100644 --- a/llama.h +++ b/llama.h @@ -390,8 +390,10 @@ extern "C" { LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); - LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); - LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + + LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);