From 7c4263d4261d6ee6f0539d53eb9e1b4d120ba8af Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Wed, 28 Feb 2024 10:37:02 +0200
Subject: [PATCH 01/16] ggml : make i-quants work with super-blocks of 64
 (CPU,Metal) (#5760)

* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml-cuda.cu     |  27 ++++++---
 ggml-metal.metal |  58 ++++++++++---------
 ggml-quants.c    | 148 +++++++++++++++++++++++++++++++++++++++--------
 ggml-quants.h    |   5 ++
 ggml.c           |  15 ++++-
 5 files changed, 194 insertions(+), 59 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index dfd28df62..831c84efb 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -544,14 +544,19 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
 
 #define QR3_XS 8
 #define QI3_XS (QK_K / (4*QR3_XS))
+#if QK_K == 64
+#define IQ3S_N_SCALE 2
+#else
+#define IQ3S_N_SCALE QK_K/64
+#endif
 typedef struct {
     half d;
     uint8_t qs[QK_K/4];
     uint8_t qh[QK_K/32];
     uint8_t signs[QK_K/8];
-    uint8_t scales[QK_K/64];
+    uint8_t scales[IQ3S_N_SCALE];
 } block_iq3_s;
-static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding");
+static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 
 #define QR1_S 8
 #define QI1_S (QK_K / (4*QR1_S))
@@ -571,6 +576,11 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
+#if QK_K == 64
+#define block_iq4_xs block_iq4_nl
+#define QR4_XS QR4_NL
+#define QI4_XS QI4_NL
+#else
 // QR4_XS = 8 is very slightly faster than QR4_XS = 4
 #define QR4_XS 8
 #define QI4_XS (QK_K / (4*QR4_XS))
@@ -581,7 +591,7 @@ typedef struct {
     uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-
+#endif
 
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@@ -2439,9 +2449,9 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
 
 }
 
+#if QK_K != 64
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
     const int i   = blockIdx.x;
     const block_iq4_xs * x = (const block_iq4_xs *)vx;
 
@@ -2455,8 +2465,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
         y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
         y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
     }
-
 }
+#endif
 
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
@@ -5382,8 +5392,7 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
     return 0.f;
 #endif
 #else
-    assert(false);
-    return 0.f;
+    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
 #endif
 }
 
@@ -7444,7 +7453,11 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
+#if QK_K == 64
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+#else
     dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }
 
 template <typename src_t, typename dst_t>
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 689411903..74a5e0b03 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -2560,12 +2560,16 @@ typedef struct {
     uint8_t qs[QK4_NL/2];
 } block_iq4_nl;
 
+#if QK_K == 64
+#define block_iq4_xs block_iq4_nl
+#else
 typedef struct {
     half d;
     uint16_t scales_h;
     uint8_t  scales_l[QK_K/64];
     uint8_t  qs[QK_K/2];
 } block_iq4_xs;
+#endif
 
 //====================================== dot products =========================
 
@@ -4346,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-#if QK_K == 256
     const int ix = tiisg;
 
     device const float * y4 = y + 32 * ix;
@@ -4387,12 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
 
         y4 += 32 * 32;
     }
-#else
-    (void) x;
-    (void) y;
-    (void) yl;
-    (void) nb32;
-#endif
 
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -4482,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-#if QK_K == 256
     const int ix = tiisg;
 
     device const float * y4 = y + 32 * ix;
@@ -4533,12 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
 
         y4 += 32 * 32;
     }
-#else
-    (void) x;
-    (void) y;
-    (void) yl;
-    (void) nb32;
-#endif
 
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -4628,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-#if QK_K == 256
     const int ix = tiisg;
 
     device const float * y4 = y + 32 * ix;
@@ -4672,12 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
 
         y4 += 32 * 32;
     }
-#else
-    (void) x;
-    (void) y;
-    (void) yl;
-    (void) nb32;
-#endif
 
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -5016,7 +4999,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
 
     const int nb32 = nb * (QK_K / 32);
 
-#if QK_K == 256
     const int ix = tiisg/2;
     const int il = tiisg%2;
 
@@ -5055,12 +5037,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
 
         y4 += 16 * 32;
     }
-#else
-    (void) x;
-    (void) y;
-    (void) yl;
-    (void) nb32;
-#endif
 
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -5167,6 +5143,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     }
 }
 
+#if QK_K != 64
 void kernel_mul_mv_iq4_xs_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -5260,6 +5237,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
         }
     }
 }
+#endif
 
 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
@@ -5344,7 +5322,11 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
 
+#if QK_K == 64
+    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+#else
     kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+#endif
 }
 
 //============================= templates and their specializations =============================
@@ -5770,6 +5752,9 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
 
 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
+#if QK_K == 64
+    dequantize_iq4_nl(xb, il, reg);
+#else
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
     const int ib32 = il/2;
     il = il%2;
@@ -5786,6 +5771,7 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
         reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
         reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
     }
+#endif
 }
 
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6334,7 +6320,11 @@ template [[host_name("kernel_get_rows_iq3_s")]]   kernel get_rows_t kernel_get_r
 template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_rows<block_iq2_s,   QK_NL, dequantize_iq2_s>;
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
+#else
 template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
 
 //
 // matrix-matrix multiplication
@@ -6378,7 +6368,11 @@ template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mat_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq2_s,   QK_NL, dequantize_iq2_s>;
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
+#else
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
 
 //
 // indirect matrix-matrix multiplication
@@ -6434,7 +6428,11 @@ template [[host_name("kernel_mul_mm_id_iq3_s_f32")]]   kernel mat_mm_id_t kernel
 template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s,   QK_NL, dequantize_iq2_s>;
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
+#else
 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
 
 //
 // matrix-vector multiplication
@@ -7707,7 +7705,11 @@ kernel void kernel_mul_mv_id_iq4_xs_f32(
 
     const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
 
+#if QK_K == 64
+    kernel_mul_mv_iq4_nl_f32_impl(
+#else
     kernel_mul_mv_iq4_xs_f32_impl(
+#endif
         src0[id],
         (device const float *) (src1 + bid*nb11),
         dst + bid*ne0,
diff --git a/ggml-quants.c b/ggml-quants.c
index f73d17ce2..371826f14 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1877,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     float mins[QK_K/16];
     float scales[QK_K/16];
     float sw[QK_K/16];
-    float weight[QK_K/16];
+    float weight[16];
     uint8_t Ls[QK_K/16], Lm[QK_K/16];
 
     for (int i = 0; i < nb; i++) {
@@ -1887,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
         float sigma2 = sumx2/QK_K;
         for (int j = 0; j < QK_K/16; ++j) {
             const float * restrict qw = quant_weights + QK_K * i + 16*j;
-            for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
             for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
-            scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
         }
 
-        float dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
-        float mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
+        float dm, mm;
+#if QK_K == 64
+        float max_scale = 0, max_min = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            max_scale = MAX(max_scale, scales[j]);
+            max_min   = MAX(max_min,   mins[j]);
+        }
+        dm = max_scale/15;
+        mm = max_min/15;
+        if (max_scale) {
+            float id = 1/dm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*scales[j]);
+                Ls[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Ls, 0, QK_K/16);
+        }
+        if (max_min) {
+            float id = 1/mm;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(id*mins[j]);
+                Lm[j] = MAX(0, MIN(15, l));
+            }
+        } else {
+            memset(Lm, 0, QK_K/16);
+        }
+#else
+        dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
+#endif
         y[i].d    = GGML_FP32_TO_FP16(dm);
         y[i].dmin = GGML_FP32_TO_FP16(mm);
         dm        = GGML_FP16_TO_FP32(y[i].d);
@@ -4227,6 +4256,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
 
 void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
     assert(k % QK_K == 0);
+#if QK_K == 64
+    dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
+#else
     const int nb = k / QK_K;
 
     for (int i = 0; i < nb; i++) {
@@ -4246,6 +4278,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
             qs += 16;
         }
     }
+#endif
 }
 
 //===================================== Q8_K ==============================================
@@ -6306,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     float sumf = 0;
 
-    int isum[4];
+    int isum[QK_K/16];
 
     for (int i = 0; i < nb; ++i) {
 
@@ -6322,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        isum[0] = isum[1] = isum[2] = isum[3] = 0;
+        memset(isum, 0, (QK_K/16)*sizeof(int));
         for (int l =  0; l < 16; ++l) {
             isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
             isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
             isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
             isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
         }
-        for (int l = 0; l < 4; ++l) {
+        for (int l = 0; l < QK_K/16; ++l) {
             isum[l] *= (sc[l] & 0xF);
         }
         sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
@@ -9488,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
 #elif defined(__AVX2__)
 
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-    const __m256i m511 = _mm256_set1_epi16(511);
     const __m256i mone = _mm256_set1_epi8(1);
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
     static const char block_sign_shuffle_mask_1[32] = {
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
         0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -9510,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
     };
 
-    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
     const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
     const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
     const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
 
+#if QK_K == 64
+    static const uint8_t k_bit_helper[16] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
+    const __m128i m511 = _mm_set1_epi16(511);
+    typedef union {
+        __m128i vec_index;
+        uint16_t index[8];
+    } index_t;
+
+    index_t idx;
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
+        idx.vec_index = _mm_and_si128(q2_data, m511);
+
+        const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
+        const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
+        const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
+
+        const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+        const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
+        const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
+
+        const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
+
+        const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
+                                               iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
+        const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
+                                               iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
+
+        __m256i signs;
+        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
+        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+        const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
+        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+        const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+        const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+        const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+
+        const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
+        const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
+
+        const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+#else
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+    const __m256i m511 = _mm256_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
     uint64_t aux64;
 
     // somewhat hacky, but gives a significant boost in performance
@@ -9603,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     }
 
     *s = 0.125f * hsum_float_8(accumf);
+#endif
 
 #else
 
@@ -10199,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const
 
     const int nb = n / QK_K;
 
-#if defined __ARM_NEON
+    // TODO: implement for QK_K = 64
+#if defined __ARM_NEON && QK_K == 256
 
     const uint8x16_t m8 = vdupq_n_u8(0x08);
     const uint8x16_t m7 = vdupq_n_u8(0x07);
@@ -10256,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const
 
     *s = sumf;
 
-#elif defined __AVX2__
+    // TODO: implement for QK_K = 64
+#elif defined __AVX2__ && QK_K == 256
 
     const __m128i m8 = _mm_set1_epi8(0x08);
     const __m128i m7 = _mm_set1_epi8(0x07);
@@ -10455,6 +10549,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     UNUSED(by);
     UNUSED(bs);
     assert(n % QK_K == 0);
+#if QK_K == 64
+    ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
+#else
 
     const block_iq4_xs * restrict x = vx;
     const block_q8_K   * restrict y = vy;
@@ -10574,6 +10671,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     }
     *s = sumf;
 #endif
+#endif
 }
 
 // ================================ IQ2 quantization =============================================
@@ -10921,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
 
     const int kMaxQ = 3;
 
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
 
     block_iq2_xxs * y = vy;
 
@@ -11094,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
 
     const int kMaxQ = 3;
 
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
 
     block_iq2_xs * y = vy;
 
@@ -12037,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
 
     block_iq1_s * y = vy;
 
@@ -12315,6 +12413,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
 }
 
 size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+#if QK_K == 64
+    return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
+#else
     (void)hist;
     GGML_ASSERT(n_per_row%QK_K == 0);
     int nblock = n_per_row/QK_K;
@@ -12333,6 +12434,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i
         qrow += nblock*sizeof(block_iq4_xs);
     }
     return nrow * nblock * sizeof(block_iq4_xs);
+#endif
 }
 
 void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
@@ -12363,7 +12465,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
 
     const int kMaxQ = 3;
 
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
 
     block_iq2_s * y = vy;
 
diff --git a/ggml-quants.h b/ggml-quants.h
index 2c61134c4..316e35687 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -230,6 +230,10 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
+#if QK_K == 64
+#define block_iq4_xs block_iq4_nl
+//typedef struct block_iq4_nl block_iq4_xs;
+#else
 typedef struct {
     ggml_fp16_t d;
     uint16_t scales_h;
@@ -237,6 +241,7 @@ typedef struct {
     uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/ggml.c b/ggml.c
index d66db3352..4591644ad 100644
--- a/ggml.c
+++ b/ggml.c
@@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     },
     [GGML_TYPE_IQ4_XS] = {
         .type_name                = "iq4_xs",
+#if QK_K == 64
+        .blck_size                = QK4_NL,
+#else
         .blck_size                = QK_K,
+#endif
         .type_size                = sizeof(block_iq4_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
         .from_float               = quantize_row_iq4_xs,
         .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
         .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
+#if QK_K == 64
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+#else
         .vec_dot_type             = GGML_TYPE_Q8_K,
+#endif
         .nrows                    = 1,
     },
     [GGML_TYPE_Q8_K] = {
@@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_IQ4_NL:
+#if QK_K == 64
+        case GGML_TYPE_IQ4_XS:
+#endif
             {
                 GGML_ASSERT(start % QK4_NL == 0);
                 GGML_ASSERT(start % n_per_row == 0);
@@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                 GGML_ASSERT(result == row_size * nrows);
             } break;
+#if QK_K != 64
         case GGML_TYPE_IQ4_XS:
             {
-                GGML_ASSERT(start % QK4_NL == 0);
+                GGML_ASSERT(start % QK_K == 0);
                 GGML_ASSERT(start % n_per_row == 0);
                 size_t start_row = start / n_per_row;
                 size_t row_size = ggml_row_size(type, n_per_row);
                 result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                 GGML_ASSERT(result == row_size * nrows);
             } break;
+#endif
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);

From efc72253f7987ed7bdc8bde9d9fa5c7cac2f6292 Mon Sep 17 00:00:00 2001
From: Jorge A <161275481+jorgealias@users.noreply.github.com>
Date: Wed, 28 Feb 2024 01:39:15 -0700
Subject: [PATCH 02/16] server : add "/chat/completions" alias for "/v1/...`
 (#5722)

* Add "/chat/completions" as alias for "/v1/chat/completions"

* merge to upstream master

* minor : fix trailing whitespace

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/server/server.cpp                    | 133 +++++++++---------
 .../server/tests/features/parallel.feature    |  22 +++
 examples/server/tests/features/steps/steps.py |  28 +++-
 3 files changed, 115 insertions(+), 68 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 846ef7e5f..6b3ee531c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3211,87 +3211,88 @@ int main(int argc, char **argv)
                 res.set_content(models.dump(), "application/json; charset=utf-8");
             });
 
+    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
+    {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        if (!validate_api_key(req, res)) {
+            return;
+        }
+        json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
 
-    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                if (!validate_api_key(req, res)) {
-                    return;
-                }
-                json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, data, false, false, -1);
 
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, data, false, false, -1);
+        if (!json_value(data, "stream", false)) {
+            std::string completion_text;
+            task_result result = llama.queue_results.recv(task_id);
 
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.queue_results.recv(task_id);
+            if (!result.error && result.stop) {
+                json oaicompat_result = format_final_response_oaicompat(data, result);
 
-                    if (!result.error && result.stop) {
-                        json oaicompat_result = format_final_response_oaicompat(data, result);
+                res.set_content(oaicompat_result.dump(-1, ' ', false,
+                                    json::error_handler_t::replace),
+                                    "application/json; charset=utf-8");
+            } else {
+                res.status = 500;
+                res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
+            }
+            llama.queue_results.remove_waiting_task_id(task_id);
+        } else {
+            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
+                while (true) {
+                    task_result llama_result = llama.queue_results.recv(task_id);
+                    if (!llama_result.error) {
+                        std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
 
-                        res.set_content(oaicompat_result.dump(-1, ' ', false,
-                                            json::error_handler_t::replace),
-                                            "application/json; charset=utf-8");
-                    } else {
-                        res.status = 500;
-                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                    }
-                    llama.queue_results.remove_waiting_task_id(task_id);
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
-                        while (true) {
-                            task_result llama_result = llama.queue_results.recv(task_id);
-                            if (!llama_result.error) {
-                                std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
-
-                                for (auto it = result_array.begin(); it != result_array.end(); ++it)
-                                {
-                                    if (!it->empty()) {
-                                        const std::string str =
-                                            "data: " +
-                                            it->dump(-1, ' ', false, json::error_handler_t::replace) +
-                                            "\n\n";
-                                        LOG_VERBOSE("data stream", {{"to_send", str}});
-                                        if (!sink.write(str.c_str(), str.size())) {
-                                            llama.queue_results.remove_waiting_task_id(task_id);
-                                            return false;
-                                        }
-                                    }
-                                }
-                                if (llama_result.stop) {
-                                    break;
-                                }
-                            } else {
+                        for (auto it = result_array.begin(); it != result_array.end(); ++it)
+                        {
+                            if (!it->empty()) {
                                 const std::string str =
-                                    "error: " +
-                                    llama_result.result_json.dump(-1, ' ', false,
-                                            json::error_handler_t::replace) +
+                                    "data: " +
+                                    it->dump(-1, ' ', false, json::error_handler_t::replace) +
                                     "\n\n";
                                 LOG_VERBOSE("data stream", {{"to_send", str}});
                                 if (!sink.write(str.c_str(), str.size())) {
                                     llama.queue_results.remove_waiting_task_id(task_id);
                                     return false;
                                 }
-                                break;
                             }
                         }
-                        sink.done();
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                        return true;
-                    };
-
-                    auto on_complete = [task_id, &llama](bool) {
-                        // cancel request
-                        llama.request_cancel(task_id);
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                    };
-
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+                        if (llama_result.stop) {
+                            break;
+                        }
+                    } else {
+                        const std::string str =
+                            "error: " +
+                            llama_result.result_json.dump(-1, ' ', false,
+                                    json::error_handler_t::replace) +
+                            "\n\n";
+                        LOG_VERBOSE("data stream", {{"to_send", str}});
+                        if (!sink.write(str.c_str(), str.size())) {
+                            llama.queue_results.remove_waiting_task_id(task_id);
+                            return false;
+                        }
+                        break;
+                    }
                 }
-            });
+                sink.done();
+                llama.queue_results.remove_waiting_task_id(task_id);
+                return true;
+            };
+
+            auto on_complete = [task_id, &llama](bool) {
+                // cancel request
+                llama.request_cancel(task_id);
+                llama.queue_results.remove_waiting_task_id(task_id);
+            };
+
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+        }
+    };
+
+    svr.Post("/chat/completions", chat_completions);
+    svr.Post("/v1/chat/completions", chat_completions);
 
     svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
             {
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index c85f9de1d..5f895cf90 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -54,6 +54,28 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |
 
+  Scenario Outline: Multi users OAI completions compatibility no v1
+    Given a system prompt You are a writer.
+    And   a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests no v1
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index ad87fcb82..381da105e 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -231,6 +231,7 @@ async def step_oai_chat_completions(context, api_error):
     completion = await oai_chat_completions(context.prompts.pop(),
                                             context.system_prompt,
                                             context.base_url,
+                                            '/v1/chat',
                                             False,
                                             model=context.model if hasattr(context, 'model') else None,
 
@@ -288,6 +289,28 @@ async def step_oai_chat_completions(context):
                               # user_prompt is inserted automatically
                               context.system_prompt,
                               context.base_url,
+                              '/v1/chat/completions',
+                              True,  # async_client
+                              model=context.model
+                              if hasattr(context, 'model') else None,
+                              n_predict=context.n_predict
+                              if hasattr(context, 'n_predict') else None,
+                              enable_streaming=context.enable_streaming
+                              if hasattr(context, 'enable_streaming') else None,
+                              server_seed=context.server_seed
+                              if hasattr(context, 'server_seed') else None,
+                              user_api_key=context.user_api_key
+                              if hasattr(context, 'user_api_key') else None)
+
+
+@step(u'concurrent OAI completions requests no v1')
+@async_run_until_complete
+async def step_oai_chat_completions(context):
+    await concurrent_requests(context, oai_chat_completions,
+                              # user_prompt is inserted automatically
+                              context.system_prompt,
+                              context.base_url,
+                              '/chat/completions',
                               True,  # async_client
                               model=context.model
                               if hasattr(context, 'model') else None,
@@ -497,6 +520,7 @@ async def request_completion(prompt,
 async def oai_chat_completions(user_prompt,
                                system_prompt,
                                base_url,
+                               base_path,
                                async_client,
                                debug=False,
                                model=None,
@@ -537,7 +561,7 @@ async def oai_chat_completions(user_prompt,
         origin = 'llama.cpp'
         headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
         async with aiohttp.ClientSession() as session:
-            async with session.post(f'{base_url}/v1/chat/completions',
+            async with session.post(f'{base_url}{base_path}',
                                     json=payload,
                                     headers=headers) as response:
                 if enable_streaming:
@@ -579,7 +603,7 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}/v1/chat'
+            openai.api_base = f'{base_url}{base_path}'
             chat_completion = openai.Completion.create(
                 messages=payload['messages'],
                 model=model,

From 6c4416868df2e5455da7d20547f62bcf9735ba8e Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 28 Feb 2024 09:39:39 +0100
Subject: [PATCH 03/16] readme : add link to LLaVA 1.6 models (#5758)

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 507a2888b..5401e197f 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ Typically finetunes of the base models below are supported as well.
 
 **Multimodal models:**
 
-- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
+- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)

From 177628bfd85565070916ad66a5ac4071ee0527d8 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Wed, 28 Feb 2024 02:51:11 -0600
Subject: [PATCH 04/16] llama : improve BERT tokenization (#5740)

* implement nfd for stripping accents in wpm tokenizer

* sort nfd map; reuse iterator

* use builtin tolower

* add locale include

* Simplify to_lower cases

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
---
 llama.cpp | 137 +++++++++-------------------
 unicode.h | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+), 94 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 464e1b89b..356ca1076 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -68,10 +68,12 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
+#include <locale>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -8941,37 +8943,46 @@ struct llm_tokenizer_wpm {
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-        std::string ori_str = normalize(text);
-        uint64_t ori_size = ori_str.size();
+        // normalalization form D
+        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
+        std::vector<uint32_t> nfd_codepoints;
+        for (uint32_t code : codepoints) {
+            auto it = nfd_map.find(code);
+            if (it != nfd_map.end()) {
+                for (uint32_t c : it->second) {
+                    nfd_codepoints.push_back(c);
+                }
+            } else {
+                nfd_codepoints.push_back(code);
+            }
+        }
 
-        // single punct / single symbol / single digit
-        // baseline: add whitespace on the left and right of punct and chinese characters
-        std::vector<std::string> words;
+        // strip accents, strip control, uniformize whitespace,
+        // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-        uint64_t i = 0;
-        while (i < ori_size) {
-            int utf_char_len = utf8_len(ori_str[i]);
-            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
-                new_str += " ";
-                new_str += ori_str[i];
-                new_str += " ";
-                i += 1;
+        for (uint32_t code : nfd_codepoints) {
+            int type = codepoint_type(code);
+            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+                continue;
             }
-            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
-                new_str += " ";
-                new_str += ori_str.substr(i, 3);
-                new_str += " ";
-                i += 3;
+            code = to_lower(code);
+            if (type == CODEPOINT_TYPE_WHITESPACE) {
+                code = ' ';
             }
-            else {
-                new_str += ori_str[i];
-                i += 1;
+            std::string s = codepoint_to_utf8(code);
+            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+                new_str += " ";
+                new_str += s;
+                new_str += " ";
+            } else {
+                new_str += s;
             }
         }
 
         // split by whitespace
         uint64_t l = 0;
         uint64_t r = 0;
+        std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
             if (isspace(new_str[r])) {
@@ -8989,47 +9000,20 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    std::string normalize(const std::string & text) {
-        // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
-        std::string text2 = strip_accents(text);
-        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
-            char c = text2[i];
-            if (c >= 'A' && c <= 'Z') {
-                text2[i] = c - 'A' + 'a';
-            }
+    uint32_t to_lower(uint32_t code) {
+#if defined(_WIN32)
+        if (code > 0xFFFF) {
+            return code;
         }
-        return text2;
+#endif
+        return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
     }
 
-    bool is_chinese_char(const std::string & str) {
-        int len = str.length();
-        unsigned int codepoint = 0;
-        int num_bytes = 0;
-        int i = 0;
-        unsigned char ch = static_cast<unsigned char>(str[i]);
-        if (ch <= 0x7f) {
-            codepoint = ch;
-            num_bytes = 1;
-        } else if ((ch >> 5) == 0x06) {
-            codepoint = ch & 0x1f;
-            num_bytes = 2;
-        } else if ((ch >> 4) == 0x0e) {
-            codepoint = ch & 0x0f;
-            num_bytes = 3;
-        } else if ((ch >> 3) == 0x1e) {
-            codepoint = ch & 0x07;
-            num_bytes = 4;
-        }
-        for (int j = 1; j < num_bytes; ++j) {
-            if (i + j >= len) {
-                return false; // incomplete UTF-8 character
-            }
-            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
-            if ((next_ch >> 6) != 0x02) {
-                return false; // invalid trailing byte
-            }
-            codepoint = (codepoint << 6) | (next_ch & 0x3f);
-        }
+    bool is_ascii_punct(uint32_t code) {
+        return code < 256 && ispunct(code);
+    }
+
+    bool is_chinese_char(uint32_t codepoint) {
         if ((codepoint >= 0x4E00  && codepoint <= 0x9FFF)  ||
             (codepoint >= 0x3400  && codepoint <= 0x4DBF)  ||
             (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -9045,41 +9029,6 @@ struct llm_tokenizer_wpm {
         return false;
     }
 
-    std::string strip_accents(const std::string & input_string) {
-        std::string resultString;
-        std::map<std::string, char> accent_map = {
-            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
-            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
-            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
-            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
-            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
-            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
-            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
-            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
-            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
-        };
-
-        for (size_t i = 0; i <  input_string.length();) {
-            int len = utf8_len(input_string[i]);
-            std::string curChar = input_string.substr(i, len);
-            auto iter = accent_map.find(curChar);
-            if (iter != accent_map.end()) {
-                resultString += iter->second;
-            } else {
-                resultString += curChar;
-            }
-            i += len;
-        }
-
-        return resultString;
-    }
-
-    static size_t utf8_len(char src) {
-        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-        return lookup[highbits];
-    }
-
     const llama_vocab & vocab;
 };
 
diff --git a/unicode.h b/unicode.h
index 10a5dab01..620e2b580 100644
--- a/unicode.h
+++ b/unicode.h
@@ -223,6 +223,268 @@ static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
 {0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
 };
 
+static const std::unordered_map<uint32_t, std::vector<uint32_t>> nfd_map = {
+{0xC0, {0x41, 0x300}}, {0xC1, {0x41, 0x301}}, {0xC2, {0x41, 0x302}}, {0xC3, {0x41, 0x303}}, {0xC4, {0x41, 0x308}}, {0xC5, {0x41, 0x30A}}, {0xC7, {0x43, 0x327}}, {0xC8, {0x45, 0x300}},
+{0xC9, {0x45, 0x301}}, {0xCA, {0x45, 0x302}}, {0xCB, {0x45, 0x308}}, {0xCC, {0x49, 0x300}}, {0xCD, {0x49, 0x301}}, {0xCE, {0x49, 0x302}}, {0xCF, {0x49, 0x308}}, {0xD1, {0x4E, 0x303}},
+{0xD2, {0x4F, 0x300}}, {0xD3, {0x4F, 0x301}}, {0xD4, {0x4F, 0x302}}, {0xD5, {0x4F, 0x303}}, {0xD6, {0x4F, 0x308}}, {0xD9, {0x55, 0x300}}, {0xDA, {0x55, 0x301}}, {0xDB, {0x55, 0x302}},
+{0xDC, {0x55, 0x308}}, {0xDD, {0x59, 0x301}}, {0xE0, {0x61, 0x300}}, {0xE1, {0x61, 0x301}}, {0xE2, {0x61, 0x302}}, {0xE3, {0x61, 0x303}}, {0xE4, {0x61, 0x308}}, {0xE5, {0x61, 0x30A}},
+{0xE7, {0x63, 0x327}}, {0xE8, {0x65, 0x300}}, {0xE9, {0x65, 0x301}}, {0xEA, {0x65, 0x302}}, {0xEB, {0x65, 0x308}}, {0xEC, {0x69, 0x300}}, {0xED, {0x69, 0x301}}, {0xEE, {0x69, 0x302}},
+{0xEF, {0x69, 0x308}}, {0xF1, {0x6E, 0x303}}, {0xF2, {0x6F, 0x300}}, {0xF3, {0x6F, 0x301}}, {0xF4, {0x6F, 0x302}}, {0xF5, {0x6F, 0x303}}, {0xF6, {0x6F, 0x308}}, {0xF9, {0x75, 0x300}},
+{0xFA, {0x75, 0x301}}, {0xFB, {0x75, 0x302}}, {0xFC, {0x75, 0x308}}, {0xFD, {0x79, 0x301}}, {0xFF, {0x79, 0x308}}, {0x100, {0x41, 0x304}}, {0x101, {0x61, 0x304}}, {0x102, {0x41, 0x306}},
+{0x103, {0x61, 0x306}}, {0x104, {0x41, 0x328}}, {0x105, {0x61, 0x328}}, {0x106, {0x43, 0x301}}, {0x107, {0x63, 0x301}}, {0x108, {0x43, 0x302}}, {0x109, {0x63, 0x302}}, {0x10A, {0x43, 0x307}},
+{0x10B, {0x63, 0x307}}, {0x10C, {0x43, 0x30C}}, {0x10D, {0x63, 0x30C}}, {0x10E, {0x44, 0x30C}}, {0x10F, {0x64, 0x30C}}, {0x112, {0x45, 0x304}}, {0x113, {0x65, 0x304}}, {0x114, {0x45, 0x306}},
+{0x115, {0x65, 0x306}}, {0x116, {0x45, 0x307}}, {0x117, {0x65, 0x307}}, {0x118, {0x45, 0x328}}, {0x119, {0x65, 0x328}}, {0x11A, {0x45, 0x30C}}, {0x11B, {0x65, 0x30C}}, {0x11C, {0x47, 0x302}},
+{0x11D, {0x67, 0x302}}, {0x11E, {0x47, 0x306}}, {0x11F, {0x67, 0x306}}, {0x120, {0x47, 0x307}}, {0x121, {0x67, 0x307}}, {0x122, {0x47, 0x327}}, {0x123, {0x67, 0x327}}, {0x124, {0x48, 0x302}},
+{0x125, {0x68, 0x302}}, {0x128, {0x49, 0x303}}, {0x129, {0x69, 0x303}}, {0x12A, {0x49, 0x304}}, {0x12B, {0x69, 0x304}}, {0x12C, {0x49, 0x306}}, {0x12D, {0x69, 0x306}}, {0x12E, {0x49, 0x328}},
+{0x12F, {0x69, 0x328}}, {0x130, {0x49, 0x307}}, {0x134, {0x4A, 0x302}}, {0x135, {0x6A, 0x302}}, {0x136, {0x4B, 0x327}}, {0x137, {0x6B, 0x327}}, {0x139, {0x4C, 0x301}}, {0x13A, {0x6C, 0x301}},
+{0x13B, {0x4C, 0x327}}, {0x13C, {0x6C, 0x327}}, {0x13D, {0x4C, 0x30C}}, {0x13E, {0x6C, 0x30C}}, {0x143, {0x4E, 0x301}}, {0x144, {0x6E, 0x301}}, {0x145, {0x4E, 0x327}}, {0x146, {0x6E, 0x327}},
+{0x147, {0x4E, 0x30C}}, {0x148, {0x6E, 0x30C}}, {0x14C, {0x4F, 0x304}}, {0x14D, {0x6F, 0x304}}, {0x14E, {0x4F, 0x306}}, {0x14F, {0x6F, 0x306}}, {0x150, {0x4F, 0x30B}}, {0x151, {0x6F, 0x30B}},
+{0x154, {0x52, 0x301}}, {0x155, {0x72, 0x301}}, {0x156, {0x52, 0x327}}, {0x157, {0x72, 0x327}}, {0x158, {0x52, 0x30C}}, {0x159, {0x72, 0x30C}}, {0x15A, {0x53, 0x301}}, {0x15B, {0x73, 0x301}},
+{0x15C, {0x53, 0x302}}, {0x15D, {0x73, 0x302}}, {0x15E, {0x53, 0x327}}, {0x15F, {0x73, 0x327}}, {0x160, {0x53, 0x30C}}, {0x161, {0x73, 0x30C}}, {0x162, {0x54, 0x327}}, {0x163, {0x74, 0x327}},
+{0x164, {0x54, 0x30C}}, {0x165, {0x74, 0x30C}}, {0x168, {0x55, 0x303}}, {0x169, {0x75, 0x303}}, {0x16A, {0x55, 0x304}}, {0x16B, {0x75, 0x304}}, {0x16C, {0x55, 0x306}}, {0x16D, {0x75, 0x306}},
+{0x16E, {0x55, 0x30A}}, {0x16F, {0x75, 0x30A}}, {0x170, {0x55, 0x30B}}, {0x171, {0x75, 0x30B}}, {0x172, {0x55, 0x328}}, {0x173, {0x75, 0x328}}, {0x174, {0x57, 0x302}}, {0x175, {0x77, 0x302}},
+{0x176, {0x59, 0x302}}, {0x177, {0x79, 0x302}}, {0x178, {0x59, 0x308}}, {0x179, {0x5A, 0x301}}, {0x17A, {0x7A, 0x301}}, {0x17B, {0x5A, 0x307}}, {0x17C, {0x7A, 0x307}}, {0x17D, {0x5A, 0x30C}},
+{0x17E, {0x7A, 0x30C}}, {0x1A0, {0x4F, 0x31B}}, {0x1A1, {0x6F, 0x31B}}, {0x1AF, {0x55, 0x31B}}, {0x1B0, {0x75, 0x31B}}, {0x1CD, {0x41, 0x30C}}, {0x1CE, {0x61, 0x30C}}, {0x1CF, {0x49, 0x30C}},
+{0x1D0, {0x69, 0x30C}}, {0x1D1, {0x4F, 0x30C}}, {0x1D2, {0x6F, 0x30C}}, {0x1D3, {0x55, 0x30C}}, {0x1D4, {0x75, 0x30C}}, {0x1D5, {0x55, 0x308, 0x304}}, {0x1D6, {0x75, 0x308, 0x304}},
+{0x1D7, {0x55, 0x308, 0x301}}, {0x1D8, {0x75, 0x308, 0x301}}, {0x1D9, {0x55, 0x308, 0x30C}}, {0x1DA, {0x75, 0x308, 0x30C}}, {0x1DB, {0x55, 0x308, 0x300}}, {0x1DC, {0x75, 0x308, 0x300}},
+{0x1DE, {0x41, 0x308, 0x304}}, {0x1DF, {0x61, 0x308, 0x304}}, {0x1E0, {0x41, 0x307, 0x304}}, {0x1E1, {0x61, 0x307, 0x304}}, {0x1E2, {0xC6, 0x304}}, {0x1E3, {0xE6, 0x304}}, {0x1E6, {0x47, 0x30C}},
+{0x1E7, {0x67, 0x30C}}, {0x1E8, {0x4B, 0x30C}}, {0x1E9, {0x6B, 0x30C}}, {0x1EA, {0x4F, 0x328}}, {0x1EB, {0x6F, 0x328}}, {0x1EC, {0x4F, 0x328, 0x304}}, {0x1ED, {0x6F, 0x328, 0x304}},
+{0x1EE, {0x1B7, 0x30C}}, {0x1EF, {0x292, 0x30C}}, {0x1F0, {0x6A, 0x30C}}, {0x1F4, {0x47, 0x301}}, {0x1F5, {0x67, 0x301}}, {0x1F8, {0x4E, 0x300}}, {0x1F9, {0x6E, 0x300}}, {0x1FA, {0x41, 0x30A, 0x301}},
+{0x1FB, {0x61, 0x30A, 0x301}}, {0x1FC, {0xC6, 0x301}}, {0x1FD, {0xE6, 0x301}}, {0x1FE, {0xD8, 0x301}}, {0x1FF, {0xF8, 0x301}}, {0x200, {0x41, 0x30F}}, {0x201, {0x61, 0x30F}}, {0x202, {0x41, 0x311}},
+{0x203, {0x61, 0x311}}, {0x204, {0x45, 0x30F}}, {0x205, {0x65, 0x30F}}, {0x206, {0x45, 0x311}}, {0x207, {0x65, 0x311}}, {0x208, {0x49, 0x30F}}, {0x209, {0x69, 0x30F}}, {0x20A, {0x49, 0x311}},
+{0x20B, {0x69, 0x311}}, {0x20C, {0x4F, 0x30F}}, {0x20D, {0x6F, 0x30F}}, {0x20E, {0x4F, 0x311}}, {0x20F, {0x6F, 0x311}}, {0x210, {0x52, 0x30F}}, {0x211, {0x72, 0x30F}}, {0x212, {0x52, 0x311}},
+{0x213, {0x72, 0x311}}, {0x214, {0x55, 0x30F}}, {0x215, {0x75, 0x30F}}, {0x216, {0x55, 0x311}}, {0x217, {0x75, 0x311}}, {0x218, {0x53, 0x326}}, {0x219, {0x73, 0x326}}, {0x21A, {0x54, 0x326}},
+{0x21B, {0x74, 0x326}}, {0x21E, {0x48, 0x30C}}, {0x21F, {0x68, 0x30C}}, {0x226, {0x41, 0x307}}, {0x227, {0x61, 0x307}}, {0x228, {0x45, 0x327}}, {0x229, {0x65, 0x327}}, {0x22A, {0x4F, 0x308, 0x304}},
+{0x22B, {0x6F, 0x308, 0x304}}, {0x22C, {0x4F, 0x303, 0x304}}, {0x22D, {0x6F, 0x303, 0x304}}, {0x22E, {0x4F, 0x307}}, {0x22F, {0x6F, 0x307}}, {0x230, {0x4F, 0x307, 0x304}},
+{0x231, {0x6F, 0x307, 0x304}}, {0x232, {0x59, 0x304}}, {0x233, {0x79, 0x304}}, {0x340, {0x300}}, {0x341, {0x301}}, {0x343, {0x313}}, {0x344, {0x308, 0x301}}, {0x374, {0x2B9}}, {0x37E, {0x3B}},
+{0x385, {0xA8, 0x301}}, {0x386, {0x391, 0x301}}, {0x387, {0xB7}}, {0x388, {0x395, 0x301}}, {0x389, {0x397, 0x301}}, {0x38A, {0x399, 0x301}}, {0x38C, {0x39F, 0x301}}, {0x38E, {0x3A5, 0x301}},
+{0x38F, {0x3A9, 0x301}}, {0x390, {0x3B9, 0x308, 0x301}}, {0x3AA, {0x399, 0x308}}, {0x3AB, {0x3A5, 0x308}}, {0x3AC, {0x3B1, 0x301}}, {0x3AD, {0x3B5, 0x301}}, {0x3AE, {0x3B7, 0x301}},
+{0x3AF, {0x3B9, 0x301}}, {0x3B0, {0x3C5, 0x308, 0x301}}, {0x3CA, {0x3B9, 0x308}}, {0x3CB, {0x3C5, 0x308}}, {0x3CC, {0x3BF, 0x301}}, {0x3CD, {0x3C5, 0x301}}, {0x3CE, {0x3C9, 0x301}},
+{0x3D3, {0x3D2, 0x301}}, {0x3D4, {0x3D2, 0x308}}, {0x400, {0x415, 0x300}}, {0x401, {0x415, 0x308}}, {0x403, {0x413, 0x301}}, {0x407, {0x406, 0x308}}, {0x40C, {0x41A, 0x301}}, {0x40D, {0x418, 0x300}},
+{0x40E, {0x423, 0x306}}, {0x419, {0x418, 0x306}}, {0x439, {0x438, 0x306}}, {0x450, {0x435, 0x300}}, {0x451, {0x435, 0x308}}, {0x453, {0x433, 0x301}}, {0x457, {0x456, 0x308}}, {0x45C, {0x43A, 0x301}},
+{0x45D, {0x438, 0x300}}, {0x45E, {0x443, 0x306}}, {0x476, {0x474, 0x30F}}, {0x477, {0x475, 0x30F}}, {0x4C1, {0x416, 0x306}}, {0x4C2, {0x436, 0x306}}, {0x4D0, {0x410, 0x306}}, {0x4D1, {0x430, 0x306}},
+{0x4D2, {0x410, 0x308}}, {0x4D3, {0x430, 0x308}}, {0x4D6, {0x415, 0x306}}, {0x4D7, {0x435, 0x306}}, {0x4DA, {0x4D8, 0x308}}, {0x4DB, {0x4D9, 0x308}}, {0x4DC, {0x416, 0x308}}, {0x4DD, {0x436, 0x308}},
+{0x4DE, {0x417, 0x308}}, {0x4DF, {0x437, 0x308}}, {0x4E2, {0x418, 0x304}}, {0x4E3, {0x438, 0x304}}, {0x4E4, {0x418, 0x308}}, {0x4E5, {0x438, 0x308}}, {0x4E6, {0x41E, 0x308}}, {0x4E7, {0x43E, 0x308}},
+{0x4EA, {0x4E8, 0x308}}, {0x4EB, {0x4E9, 0x308}}, {0x4EC, {0x42D, 0x308}}, {0x4ED, {0x44D, 0x308}}, {0x4EE, {0x423, 0x304}}, {0x4EF, {0x443, 0x304}}, {0x4F0, {0x423, 0x308}}, {0x4F1, {0x443, 0x308}},
+{0x4F2, {0x423, 0x30B}}, {0x4F3, {0x443, 0x30B}}, {0x4F4, {0x427, 0x308}}, {0x4F5, {0x447, 0x308}}, {0x4F8, {0x42B, 0x308}}, {0x4F9, {0x44B, 0x308}}, {0x622, {0x627, 0x653}}, {0x623, {0x627, 0x654}},
+{0x624, {0x648, 0x654}}, {0x625, {0x627, 0x655}}, {0x626, {0x64A, 0x654}}, {0x6C0, {0x6D5, 0x654}}, {0x6C2, {0x6C1, 0x654}}, {0x6D3, {0x6D2, 0x654}}, {0x929, {0x928, 0x93C}}, {0x931, {0x930, 0x93C}},
+{0x934, {0x933, 0x93C}}, {0x958, {0x915, 0x93C}}, {0x959, {0x916, 0x93C}}, {0x95A, {0x917, 0x93C}}, {0x95B, {0x91C, 0x93C}}, {0x95C, {0x921, 0x93C}}, {0x95D, {0x922, 0x93C}}, {0x95E, {0x92B, 0x93C}},
+{0x95F, {0x92F, 0x93C}}, {0x9CB, {0x9C7, 0x9BE}}, {0x9CC, {0x9C7, 0x9D7}}, {0x9DC, {0x9A1, 0x9BC}}, {0x9DD, {0x9A2, 0x9BC}}, {0x9DF, {0x9AF, 0x9BC}}, {0xA33, {0xA32, 0xA3C}}, {0xA36, {0xA38, 0xA3C}},
+{0xA59, {0xA16, 0xA3C}}, {0xA5A, {0xA17, 0xA3C}}, {0xA5B, {0xA1C, 0xA3C}}, {0xA5E, {0xA2B, 0xA3C}}, {0xB48, {0xB47, 0xB56}}, {0xB4B, {0xB47, 0xB3E}}, {0xB4C, {0xB47, 0xB57}}, {0xB5C, {0xB21, 0xB3C}},
+{0xB5D, {0xB22, 0xB3C}}, {0xB94, {0xB92, 0xBD7}}, {0xBCA, {0xBC6, 0xBBE}}, {0xBCB, {0xBC7, 0xBBE}}, {0xBCC, {0xBC6, 0xBD7}}, {0xC48, {0xC46, 0xC56}}, {0xCC0, {0xCBF, 0xCD5}}, {0xCC7, {0xCC6, 0xCD5}},
+{0xCC8, {0xCC6, 0xCD6}}, {0xCCA, {0xCC6, 0xCC2}}, {0xCCB, {0xCC6, 0xCC2, 0xCD5}}, {0xD4A, {0xD46, 0xD3E}}, {0xD4B, {0xD47, 0xD3E}}, {0xD4C, {0xD46, 0xD57}}, {0xDDA, {0xDD9, 0xDCA}},
+{0xDDC, {0xDD9, 0xDCF}}, {0xDDD, {0xDD9, 0xDCF, 0xDCA}}, {0xDDE, {0xDD9, 0xDDF}}, {0xF43, {0xF42, 0xFB7}}, {0xF4D, {0xF4C, 0xFB7}}, {0xF52, {0xF51, 0xFB7}}, {0xF57, {0xF56, 0xFB7}},
+{0xF5C, {0xF5B, 0xFB7}}, {0xF69, {0xF40, 0xFB5}}, {0xF73, {0xF71, 0xF72}}, {0xF75, {0xF71, 0xF74}}, {0xF76, {0xFB2, 0xF80}}, {0xF78, {0xFB3, 0xF80}}, {0xF81, {0xF71, 0xF80}}, {0xF93, {0xF92, 0xFB7}},
+{0xF9D, {0xF9C, 0xFB7}}, {0xFA2, {0xFA1, 0xFB7}}, {0xFA7, {0xFA6, 0xFB7}}, {0xFAC, {0xFAB, 0xFB7}}, {0xFB9, {0xF90, 0xFB5}}, {0x1026, {0x1025, 0x102E}}, {0x1B06, {0x1B05, 0x1B35}},
+{0x1B08, {0x1B07, 0x1B35}}, {0x1B0A, {0x1B09, 0x1B35}}, {0x1B0C, {0x1B0B, 0x1B35}}, {0x1B0E, {0x1B0D, 0x1B35}}, {0x1B12, {0x1B11, 0x1B35}}, {0x1B3B, {0x1B3A, 0x1B35}}, {0x1B3D, {0x1B3C, 0x1B35}},
+{0x1B40, {0x1B3E, 0x1B35}}, {0x1B41, {0x1B3F, 0x1B35}}, {0x1B43, {0x1B42, 0x1B35}}, {0x1E00, {0x41, 0x325}}, {0x1E01, {0x61, 0x325}}, {0x1E02, {0x42, 0x307}}, {0x1E03, {0x62, 0x307}},
+{0x1E04, {0x42, 0x323}}, {0x1E05, {0x62, 0x323}}, {0x1E06, {0x42, 0x331}}, {0x1E07, {0x62, 0x331}}, {0x1E08, {0x43, 0x327, 0x301}}, {0x1E09, {0x63, 0x327, 0x301}}, {0x1E0A, {0x44, 0x307}},
+{0x1E0B, {0x64, 0x307}}, {0x1E0C, {0x44, 0x323}}, {0x1E0D, {0x64, 0x323}}, {0x1E0E, {0x44, 0x331}}, {0x1E0F, {0x64, 0x331}}, {0x1E10, {0x44, 0x327}}, {0x1E11, {0x64, 0x327}}, {0x1E12, {0x44, 0x32D}},
+{0x1E13, {0x64, 0x32D}}, {0x1E14, {0x45, 0x304, 0x300}}, {0x1E15, {0x65, 0x304, 0x300}}, {0x1E16, {0x45, 0x304, 0x301}}, {0x1E17, {0x65, 0x304, 0x301}}, {0x1E18, {0x45, 0x32D}},
+{0x1E19, {0x65, 0x32D}}, {0x1E1A, {0x45, 0x330}}, {0x1E1B, {0x65, 0x330}}, {0x1E1C, {0x45, 0x327, 0x306}}, {0x1E1D, {0x65, 0x327, 0x306}}, {0x1E1E, {0x46, 0x307}}, {0x1E1F, {0x66, 0x307}},
+{0x1E20, {0x47, 0x304}}, {0x1E21, {0x67, 0x304}}, {0x1E22, {0x48, 0x307}}, {0x1E23, {0x68, 0x307}}, {0x1E24, {0x48, 0x323}}, {0x1E25, {0x68, 0x323}}, {0x1E26, {0x48, 0x308}}, {0x1E27, {0x68, 0x308}},
+{0x1E28, {0x48, 0x327}}, {0x1E29, {0x68, 0x327}}, {0x1E2A, {0x48, 0x32E}}, {0x1E2B, {0x68, 0x32E}}, {0x1E2C, {0x49, 0x330}}, {0x1E2D, {0x69, 0x330}}, {0x1E2E, {0x49, 0x308, 0x301}},
+{0x1E2F, {0x69, 0x308, 0x301}}, {0x1E30, {0x4B, 0x301}}, {0x1E31, {0x6B, 0x301}}, {0x1E32, {0x4B, 0x323}}, {0x1E33, {0x6B, 0x323}}, {0x1E34, {0x4B, 0x331}}, {0x1E35, {0x6B, 0x331}},
+{0x1E36, {0x4C, 0x323}}, {0x1E37, {0x6C, 0x323}}, {0x1E38, {0x4C, 0x323, 0x304}}, {0x1E39, {0x6C, 0x323, 0x304}}, {0x1E3A, {0x4C, 0x331}}, {0x1E3B, {0x6C, 0x331}}, {0x1E3C, {0x4C, 0x32D}},
+{0x1E3D, {0x6C, 0x32D}}, {0x1E3E, {0x4D, 0x301}}, {0x1E3F, {0x6D, 0x301}}, {0x1E40, {0x4D, 0x307}}, {0x1E41, {0x6D, 0x307}}, {0x1E42, {0x4D, 0x323}}, {0x1E43, {0x6D, 0x323}}, {0x1E44, {0x4E, 0x307}},
+{0x1E45, {0x6E, 0x307}}, {0x1E46, {0x4E, 0x323}}, {0x1E47, {0x6E, 0x323}}, {0x1E48, {0x4E, 0x331}}, {0x1E49, {0x6E, 0x331}}, {0x1E4A, {0x4E, 0x32D}}, {0x1E4B, {0x6E, 0x32D}},
+{0x1E4C, {0x4F, 0x303, 0x301}}, {0x1E4D, {0x6F, 0x303, 0x301}}, {0x1E4E, {0x4F, 0x303, 0x308}}, {0x1E4F, {0x6F, 0x303, 0x308}}, {0x1E50, {0x4F, 0x304, 0x300}}, {0x1E51, {0x6F, 0x304, 0x300}},
+{0x1E52, {0x4F, 0x304, 0x301}}, {0x1E53, {0x6F, 0x304, 0x301}}, {0x1E54, {0x50, 0x301}}, {0x1E55, {0x70, 0x301}}, {0x1E56, {0x50, 0x307}}, {0x1E57, {0x70, 0x307}}, {0x1E58, {0x52, 0x307}},
+{0x1E59, {0x72, 0x307}}, {0x1E5A, {0x52, 0x323}}, {0x1E5B, {0x72, 0x323}}, {0x1E5C, {0x52, 0x323, 0x304}}, {0x1E5D, {0x72, 0x323, 0x304}}, {0x1E5E, {0x52, 0x331}}, {0x1E5F, {0x72, 0x331}},
+{0x1E60, {0x53, 0x307}}, {0x1E61, {0x73, 0x307}}, {0x1E62, {0x53, 0x323}}, {0x1E63, {0x73, 0x323}}, {0x1E64, {0x53, 0x301, 0x307}}, {0x1E65, {0x73, 0x301, 0x307}}, {0x1E66, {0x53, 0x30C, 0x307}},
+{0x1E67, {0x73, 0x30C, 0x307}}, {0x1E68, {0x53, 0x323, 0x307}}, {0x1E69, {0x73, 0x323, 0x307}}, {0x1E6A, {0x54, 0x307}}, {0x1E6B, {0x74, 0x307}}, {0x1E6C, {0x54, 0x323}}, {0x1E6D, {0x74, 0x323}},
+{0x1E6E, {0x54, 0x331}}, {0x1E6F, {0x74, 0x331}}, {0x1E70, {0x54, 0x32D}}, {0x1E71, {0x74, 0x32D}}, {0x1E72, {0x55, 0x324}}, {0x1E73, {0x75, 0x324}}, {0x1E74, {0x55, 0x330}}, {0x1E75, {0x75, 0x330}},
+{0x1E76, {0x55, 0x32D}}, {0x1E77, {0x75, 0x32D}}, {0x1E78, {0x55, 0x303, 0x301}}, {0x1E79, {0x75, 0x303, 0x301}}, {0x1E7A, {0x55, 0x304, 0x308}}, {0x1E7B, {0x75, 0x304, 0x308}},
+{0x1E7C, {0x56, 0x303}}, {0x1E7D, {0x76, 0x303}}, {0x1E7E, {0x56, 0x323}}, {0x1E7F, {0x76, 0x323}}, {0x1E80, {0x57, 0x300}}, {0x1E81, {0x77, 0x300}}, {0x1E82, {0x57, 0x301}}, {0x1E83, {0x77, 0x301}},
+{0x1E84, {0x57, 0x308}}, {0x1E85, {0x77, 0x308}}, {0x1E86, {0x57, 0x307}}, {0x1E87, {0x77, 0x307}}, {0x1E88, {0x57, 0x323}}, {0x1E89, {0x77, 0x323}}, {0x1E8A, {0x58, 0x307}}, {0x1E8B, {0x78, 0x307}},
+{0x1E8C, {0x58, 0x308}}, {0x1E8D, {0x78, 0x308}}, {0x1E8E, {0x59, 0x307}}, {0x1E8F, {0x79, 0x307}}, {0x1E90, {0x5A, 0x302}}, {0x1E91, {0x7A, 0x302}}, {0x1E92, {0x5A, 0x323}}, {0x1E93, {0x7A, 0x323}},
+{0x1E94, {0x5A, 0x331}}, {0x1E95, {0x7A, 0x331}}, {0x1E96, {0x68, 0x331}}, {0x1E97, {0x74, 0x308}}, {0x1E98, {0x77, 0x30A}}, {0x1E99, {0x79, 0x30A}}, {0x1E9B, {0x17F, 0x307}}, {0x1EA0, {0x41, 0x323}},
+{0x1EA1, {0x61, 0x323}}, {0x1EA2, {0x41, 0x309}}, {0x1EA3, {0x61, 0x309}}, {0x1EA4, {0x41, 0x302, 0x301}}, {0x1EA5, {0x61, 0x302, 0x301}}, {0x1EA6, {0x41, 0x302, 0x300}},
+{0x1EA7, {0x61, 0x302, 0x300}}, {0x1EA8, {0x41, 0x302, 0x309}}, {0x1EA9, {0x61, 0x302, 0x309}}, {0x1EAA, {0x41, 0x302, 0x303}}, {0x1EAB, {0x61, 0x302, 0x303}}, {0x1EAC, {0x41, 0x323, 0x302}},
+{0x1EAD, {0x61, 0x323, 0x302}}, {0x1EAE, {0x41, 0x306, 0x301}}, {0x1EAF, {0x61, 0x306, 0x301}}, {0x1EB0, {0x41, 0x306, 0x300}}, {0x1EB1, {0x61, 0x306, 0x300}}, {0x1EB2, {0x41, 0x306, 0x309}},
+{0x1EB3, {0x61, 0x306, 0x309}}, {0x1EB4, {0x41, 0x306, 0x303}}, {0x1EB5, {0x61, 0x306, 0x303}}, {0x1EB6, {0x41, 0x323, 0x306}}, {0x1EB7, {0x61, 0x323, 0x306}}, {0x1EB8, {0x45, 0x323}},
+{0x1EB9, {0x65, 0x323}}, {0x1EBA, {0x45, 0x309}}, {0x1EBB, {0x65, 0x309}}, {0x1EBC, {0x45, 0x303}}, {0x1EBD, {0x65, 0x303}}, {0x1EBE, {0x45, 0x302, 0x301}}, {0x1EBF, {0x65, 0x302, 0x301}},
+{0x1EC0, {0x45, 0x302, 0x300}}, {0x1EC1, {0x65, 0x302, 0x300}}, {0x1EC2, {0x45, 0x302, 0x309}}, {0x1EC3, {0x65, 0x302, 0x309}}, {0x1EC4, {0x45, 0x302, 0x303}}, {0x1EC5, {0x65, 0x302, 0x303}},
+{0x1EC6, {0x45, 0x323, 0x302}}, {0x1EC7, {0x65, 0x323, 0x302}}, {0x1EC8, {0x49, 0x309}}, {0x1EC9, {0x69, 0x309}}, {0x1ECA, {0x49, 0x323}}, {0x1ECB, {0x69, 0x323}}, {0x1ECC, {0x4F, 0x323}},
+{0x1ECD, {0x6F, 0x323}}, {0x1ECE, {0x4F, 0x309}}, {0x1ECF, {0x6F, 0x309}}, {0x1ED0, {0x4F, 0x302, 0x301}}, {0x1ED1, {0x6F, 0x302, 0x301}}, {0x1ED2, {0x4F, 0x302, 0x300}},
+{0x1ED3, {0x6F, 0x302, 0x300}}, {0x1ED4, {0x4F, 0x302, 0x309}}, {0x1ED5, {0x6F, 0x302, 0x309}}, {0x1ED6, {0x4F, 0x302, 0x303}}, {0x1ED7, {0x6F, 0x302, 0x303}}, {0x1ED8, {0x4F, 0x323, 0x302}},
+{0x1ED9, {0x6F, 0x323, 0x302}}, {0x1EDA, {0x4F, 0x31B, 0x301}}, {0x1EDB, {0x6F, 0x31B, 0x301}}, {0x1EDC, {0x4F, 0x31B, 0x300}}, {0x1EDD, {0x6F, 0x31B, 0x300}}, {0x1EDE, {0x4F, 0x31B, 0x309}},
+{0x1EDF, {0x6F, 0x31B, 0x309}}, {0x1EE0, {0x4F, 0x31B, 0x303}}, {0x1EE1, {0x6F, 0x31B, 0x303}}, {0x1EE2, {0x4F, 0x31B, 0x323}}, {0x1EE3, {0x6F, 0x31B, 0x323}}, {0x1EE4, {0x55, 0x323}},
+{0x1EE5, {0x75, 0x323}}, {0x1EE6, {0x55, 0x309}}, {0x1EE7, {0x75, 0x309}}, {0x1EE8, {0x55, 0x31B, 0x301}}, {0x1EE9, {0x75, 0x31B, 0x301}}, {0x1EEA, {0x55, 0x31B, 0x300}},
+{0x1EEB, {0x75, 0x31B, 0x300}}, {0x1EEC, {0x55, 0x31B, 0x309}}, {0x1EED, {0x75, 0x31B, 0x309}}, {0x1EEE, {0x55, 0x31B, 0x303}}, {0x1EEF, {0x75, 0x31B, 0x303}}, {0x1EF0, {0x55, 0x31B, 0x323}},
+{0x1EF1, {0x75, 0x31B, 0x323}}, {0x1EF2, {0x59, 0x300}}, {0x1EF3, {0x79, 0x300}}, {0x1EF4, {0x59, 0x323}}, {0x1EF5, {0x79, 0x323}}, {0x1EF6, {0x59, 0x309}}, {0x1EF7, {0x79, 0x309}},
+{0x1EF8, {0x59, 0x303}}, {0x1EF9, {0x79, 0x303}}, {0x1F00, {0x3B1, 0x313}}, {0x1F01, {0x3B1, 0x314}}, {0x1F02, {0x3B1, 0x313, 0x300}}, {0x1F03, {0x3B1, 0x314, 0x300}}, {0x1F04, {0x3B1, 0x313, 0x301}},
+{0x1F05, {0x3B1, 0x314, 0x301}}, {0x1F06, {0x3B1, 0x313, 0x342}}, {0x1F07, {0x3B1, 0x314, 0x342}}, {0x1F08, {0x391, 0x313}}, {0x1F09, {0x391, 0x314}}, {0x1F0A, {0x391, 0x313, 0x300}},
+{0x1F0B, {0x391, 0x314, 0x300}}, {0x1F0C, {0x391, 0x313, 0x301}}, {0x1F0D, {0x391, 0x314, 0x301}}, {0x1F0E, {0x391, 0x313, 0x342}}, {0x1F0F, {0x391, 0x314, 0x342}}, {0x1F10, {0x3B5, 0x313}},
+{0x1F11, {0x3B5, 0x314}}, {0x1F12, {0x3B5, 0x313, 0x300}}, {0x1F13, {0x3B5, 0x314, 0x300}}, {0x1F14, {0x3B5, 0x313, 0x301}}, {0x1F15, {0x3B5, 0x314, 0x301}}, {0x1F18, {0x395, 0x313}},
+{0x1F19, {0x395, 0x314}}, {0x1F1A, {0x395, 0x313, 0x300}}, {0x1F1B, {0x395, 0x314, 0x300}}, {0x1F1C, {0x395, 0x313, 0x301}}, {0x1F1D, {0x395, 0x314, 0x301}}, {0x1F20, {0x3B7, 0x313}},
+{0x1F21, {0x3B7, 0x314}}, {0x1F22, {0x3B7, 0x313, 0x300}}, {0x1F23, {0x3B7, 0x314, 0x300}}, {0x1F24, {0x3B7, 0x313, 0x301}}, {0x1F25, {0x3B7, 0x314, 0x301}}, {0x1F26, {0x3B7, 0x313, 0x342}},
+{0x1F27, {0x3B7, 0x314, 0x342}}, {0x1F28, {0x397, 0x313}}, {0x1F29, {0x397, 0x314}}, {0x1F2A, {0x397, 0x313, 0x300}}, {0x1F2B, {0x397, 0x314, 0x300}}, {0x1F2C, {0x397, 0x313, 0x301}},
+{0x1F2D, {0x397, 0x314, 0x301}}, {0x1F2E, {0x397, 0x313, 0x342}}, {0x1F2F, {0x397, 0x314, 0x342}}, {0x1F30, {0x3B9, 0x313}}, {0x1F31, {0x3B9, 0x314}}, {0x1F32, {0x3B9, 0x313, 0x300}},
+{0x1F33, {0x3B9, 0x314, 0x300}}, {0x1F34, {0x3B9, 0x313, 0x301}}, {0x1F35, {0x3B9, 0x314, 0x301}}, {0x1F36, {0x3B9, 0x313, 0x342}}, {0x1F37, {0x3B9, 0x314, 0x342}}, {0x1F38, {0x399, 0x313}},
+{0x1F39, {0x399, 0x314}}, {0x1F3A, {0x399, 0x313, 0x300}}, {0x1F3B, {0x399, 0x314, 0x300}}, {0x1F3C, {0x399, 0x313, 0x301}}, {0x1F3D, {0x399, 0x314, 0x301}}, {0x1F3E, {0x399, 0x313, 0x342}},
+{0x1F3F, {0x399, 0x314, 0x342}}, {0x1F40, {0x3BF, 0x313}}, {0x1F41, {0x3BF, 0x314}}, {0x1F42, {0x3BF, 0x313, 0x300}}, {0x1F43, {0x3BF, 0x314, 0x300}}, {0x1F44, {0x3BF, 0x313, 0x301}},
+{0x1F45, {0x3BF, 0x314, 0x301}}, {0x1F48, {0x39F, 0x313}}, {0x1F49, {0x39F, 0x314}}, {0x1F4A, {0x39F, 0x313, 0x300}}, {0x1F4B, {0x39F, 0x314, 0x300}}, {0x1F4C, {0x39F, 0x313, 0x301}},
+{0x1F4D, {0x39F, 0x314, 0x301}}, {0x1F50, {0x3C5, 0x313}}, {0x1F51, {0x3C5, 0x314}}, {0x1F52, {0x3C5, 0x313, 0x300}}, {0x1F53, {0x3C5, 0x314, 0x300}}, {0x1F54, {0x3C5, 0x313, 0x301}},
+{0x1F55, {0x3C5, 0x314, 0x301}}, {0x1F56, {0x3C5, 0x313, 0x342}}, {0x1F57, {0x3C5, 0x314, 0x342}}, {0x1F59, {0x3A5, 0x314}}, {0x1F5B, {0x3A5, 0x314, 0x300}}, {0x1F5D, {0x3A5, 0x314, 0x301}},
+{0x1F5F, {0x3A5, 0x314, 0x342}}, {0x1F60, {0x3C9, 0x313}}, {0x1F61, {0x3C9, 0x314}}, {0x1F62, {0x3C9, 0x313, 0x300}}, {0x1F63, {0x3C9, 0x314, 0x300}}, {0x1F64, {0x3C9, 0x313, 0x301}},
+{0x1F65, {0x3C9, 0x314, 0x301}}, {0x1F66, {0x3C9, 0x313, 0x342}}, {0x1F67, {0x3C9, 0x314, 0x342}}, {0x1F68, {0x3A9, 0x313}}, {0x1F69, {0x3A9, 0x314}}, {0x1F6A, {0x3A9, 0x313, 0x300}},
+{0x1F6B, {0x3A9, 0x314, 0x300}}, {0x1F6C, {0x3A9, 0x313, 0x301}}, {0x1F6D, {0x3A9, 0x314, 0x301}}, {0x1F6E, {0x3A9, 0x313, 0x342}}, {0x1F6F, {0x3A9, 0x314, 0x342}}, {0x1F70, {0x3B1, 0x300}},
+{0x1F71, {0x3B1, 0x301}}, {0x1F72, {0x3B5, 0x300}}, {0x1F73, {0x3B5, 0x301}}, {0x1F74, {0x3B7, 0x300}}, {0x1F75, {0x3B7, 0x301}}, {0x1F76, {0x3B9, 0x300}}, {0x1F77, {0x3B9, 0x301}},
+{0x1F78, {0x3BF, 0x300}}, {0x1F79, {0x3BF, 0x301}}, {0x1F7A, {0x3C5, 0x300}}, {0x1F7B, {0x3C5, 0x301}}, {0x1F7C, {0x3C9, 0x300}}, {0x1F7D, {0x3C9, 0x301}}, {0x1F80, {0x3B1, 0x313, 0x345}},
+{0x1F81, {0x3B1, 0x314, 0x345}}, {0x1F82, {0x3B1, 0x313, 0x300, 0x345}}, {0x1F83, {0x3B1, 0x314, 0x300, 0x345}}, {0x1F84, {0x3B1, 0x313, 0x301, 0x345}}, {0x1F85, {0x3B1, 0x314, 0x301, 0x345}},
+{0x1F86, {0x3B1, 0x313, 0x342, 0x345}}, {0x1F87, {0x3B1, 0x314, 0x342, 0x345}}, {0x1F88, {0x391, 0x313, 0x345}}, {0x1F89, {0x391, 0x314, 0x345}}, {0x1F8A, {0x391, 0x313, 0x300, 0x345}},
+{0x1F8B, {0x391, 0x314, 0x300, 0x345}}, {0x1F8C, {0x391, 0x313, 0x301, 0x345}}, {0x1F8D, {0x391, 0x314, 0x301, 0x345}}, {0x1F8E, {0x391, 0x313, 0x342, 0x345}}, {0x1F8F, {0x391, 0x314, 0x342, 0x345}},
+{0x1F90, {0x3B7, 0x313, 0x345}}, {0x1F91, {0x3B7, 0x314, 0x345}}, {0x1F92, {0x3B7, 0x313, 0x300, 0x345}}, {0x1F93, {0x3B7, 0x314, 0x300, 0x345}}, {0x1F94, {0x3B7, 0x313, 0x301, 0x345}},
+{0x1F95, {0x3B7, 0x314, 0x301, 0x345}}, {0x1F96, {0x3B7, 0x313, 0x342, 0x345}}, {0x1F97, {0x3B7, 0x314, 0x342, 0x345}}, {0x1F98, {0x397, 0x313, 0x345}}, {0x1F99, {0x397, 0x314, 0x345}},
+{0x1F9A, {0x397, 0x313, 0x300, 0x345}}, {0x1F9B, {0x397, 0x314, 0x300, 0x345}}, {0x1F9C, {0x397, 0x313, 0x301, 0x345}}, {0x1F9D, {0x397, 0x314, 0x301, 0x345}}, {0x1F9E, {0x397, 0x313, 0x342, 0x345}},
+{0x1F9F, {0x397, 0x314, 0x342, 0x345}}, {0x1FA0, {0x3C9, 0x313, 0x345}}, {0x1FA1, {0x3C9, 0x314, 0x345}}, {0x1FA2, {0x3C9, 0x313, 0x300, 0x345}}, {0x1FA3, {0x3C9, 0x314, 0x300, 0x345}},
+{0x1FA4, {0x3C9, 0x313, 0x301, 0x345}}, {0x1FA5, {0x3C9, 0x314, 0x301, 0x345}}, {0x1FA6, {0x3C9, 0x313, 0x342, 0x345}}, {0x1FA7, {0x3C9, 0x314, 0x342, 0x345}}, {0x1FA8, {0x3A9, 0x313, 0x345}},
+{0x1FA9, {0x3A9, 0x314, 0x345}}, {0x1FAA, {0x3A9, 0x313, 0x300, 0x345}}, {0x1FAB, {0x3A9, 0x314, 0x300, 0x345}}, {0x1FAC, {0x3A9, 0x313, 0x301, 0x345}}, {0x1FAD, {0x3A9, 0x314, 0x301, 0x345}},
+{0x1FAE, {0x3A9, 0x313, 0x342, 0x345}}, {0x1FAF, {0x3A9, 0x314, 0x342, 0x345}}, {0x1FB0, {0x3B1, 0x306}}, {0x1FB1, {0x3B1, 0x304}}, {0x1FB2, {0x3B1, 0x300, 0x345}}, {0x1FB3, {0x3B1, 0x345}},
+{0x1FB4, {0x3B1, 0x301, 0x345}}, {0x1FB6, {0x3B1, 0x342}}, {0x1FB7, {0x3B1, 0x342, 0x345}}, {0x1FB8, {0x391, 0x306}}, {0x1FB9, {0x391, 0x304}}, {0x1FBA, {0x391, 0x300}}, {0x1FBB, {0x391, 0x301}},
+{0x1FBC, {0x391, 0x345}}, {0x1FBE, {0x3B9}}, {0x1FC1, {0xA8, 0x342}}, {0x1FC2, {0x3B7, 0x300, 0x345}}, {0x1FC3, {0x3B7, 0x345}}, {0x1FC4, {0x3B7, 0x301, 0x345}}, {0x1FC6, {0x3B7, 0x342}},
+{0x1FC7, {0x3B7, 0x342, 0x345}}, {0x1FC8, {0x395, 0x300}}, {0x1FC9, {0x395, 0x301}}, {0x1FCA, {0x397, 0x300}}, {0x1FCB, {0x397, 0x301}}, {0x1FCC, {0x397, 0x345}}, {0x1FCD, {0x1FBF, 0x300}},
+{0x1FCE, {0x1FBF, 0x301}}, {0x1FCF, {0x1FBF, 0x342}}, {0x1FD0, {0x3B9, 0x306}}, {0x1FD1, {0x3B9, 0x304}}, {0x1FD2, {0x3B9, 0x308, 0x300}}, {0x1FD3, {0x3B9, 0x308, 0x301}}, {0x1FD6, {0x3B9, 0x342}},
+{0x1FD7, {0x3B9, 0x308, 0x342}}, {0x1FD8, {0x399, 0x306}}, {0x1FD9, {0x399, 0x304}}, {0x1FDA, {0x399, 0x300}}, {0x1FDB, {0x399, 0x301}}, {0x1FDD, {0x1FFE, 0x300}}, {0x1FDE, {0x1FFE, 0x301}},
+{0x1FDF, {0x1FFE, 0x342}}, {0x1FE0, {0x3C5, 0x306}}, {0x1FE1, {0x3C5, 0x304}}, {0x1FE2, {0x3C5, 0x308, 0x300}}, {0x1FE3, {0x3C5, 0x308, 0x301}}, {0x1FE4, {0x3C1, 0x313}}, {0x1FE5, {0x3C1, 0x314}},
+{0x1FE6, {0x3C5, 0x342}}, {0x1FE7, {0x3C5, 0x308, 0x342}}, {0x1FE8, {0x3A5, 0x306}}, {0x1FE9, {0x3A5, 0x304}}, {0x1FEA, {0x3A5, 0x300}}, {0x1FEB, {0x3A5, 0x301}}, {0x1FEC, {0x3A1, 0x314}},
+{0x1FED, {0xA8, 0x300}}, {0x1FEE, {0xA8, 0x301}}, {0x1FEF, {0x60}}, {0x1FF2, {0x3C9, 0x300, 0x345}}, {0x1FF3, {0x3C9, 0x345}}, {0x1FF4, {0x3C9, 0x301, 0x345}}, {0x1FF6, {0x3C9, 0x342}},
+{0x1FF7, {0x3C9, 0x342, 0x345}}, {0x1FF8, {0x39F, 0x300}}, {0x1FF9, {0x39F, 0x301}}, {0x1FFA, {0x3A9, 0x300}}, {0x1FFB, {0x3A9, 0x301}}, {0x1FFC, {0x3A9, 0x345}}, {0x1FFD, {0xB4}}, {0x2000, {0x2002}},
+{0x2001, {0x2003}}, {0x2126, {0x3A9}}, {0x212A, {0x4B}}, {0x212B, {0x41, 0x30A}}, {0x219A, {0x2190, 0x338}}, {0x219B, {0x2192, 0x338}}, {0x21AE, {0x2194, 0x338}}, {0x21CD, {0x21D0, 0x338}},
+{0x21CE, {0x21D4, 0x338}}, {0x21CF, {0x21D2, 0x338}}, {0x2204, {0x2203, 0x338}}, {0x2209, {0x2208, 0x338}}, {0x220C, {0x220B, 0x338}}, {0x2224, {0x2223, 0x338}}, {0x2226, {0x2225, 0x338}},
+{0x2241, {0x223C, 0x338}}, {0x2244, {0x2243, 0x338}}, {0x2247, {0x2245, 0x338}}, {0x2249, {0x2248, 0x338}}, {0x2260, {0x3D, 0x338}}, {0x2262, {0x2261, 0x338}}, {0x226D, {0x224D, 0x338}},
+{0x226E, {0x3C, 0x338}}, {0x226F, {0x3E, 0x338}}, {0x2270, {0x2264, 0x338}}, {0x2271, {0x2265, 0x338}}, {0x2274, {0x2272, 0x338}}, {0x2275, {0x2273, 0x338}}, {0x2278, {0x2276, 0x338}},
+{0x2279, {0x2277, 0x338}}, {0x2280, {0x227A, 0x338}}, {0x2281, {0x227B, 0x338}}, {0x2284, {0x2282, 0x338}}, {0x2285, {0x2283, 0x338}}, {0x2288, {0x2286, 0x338}}, {0x2289, {0x2287, 0x338}},
+{0x22AC, {0x22A2, 0x338}}, {0x22AD, {0x22A8, 0x338}}, {0x22AE, {0x22A9, 0x338}}, {0x22AF, {0x22AB, 0x338}}, {0x22E0, {0x227C, 0x338}}, {0x22E1, {0x227D, 0x338}}, {0x22E2, {0x2291, 0x338}},
+{0x22E3, {0x2292, 0x338}}, {0x22EA, {0x22B2, 0x338}}, {0x22EB, {0x22B3, 0x338}}, {0x22EC, {0x22B4, 0x338}}, {0x22ED, {0x22B5, 0x338}}, {0x2329, {0x3008}}, {0x232A, {0x3009}},
+{0x2ADC, {0x2ADD, 0x338}}, {0x304C, {0x304B, 0x3099}}, {0x304E, {0x304D, 0x3099}}, {0x3050, {0x304F, 0x3099}}, {0x3052, {0x3051, 0x3099}}, {0x3054, {0x3053, 0x3099}}, {0x3056, {0x3055, 0x3099}},
+{0x3058, {0x3057, 0x3099}}, {0x305A, {0x3059, 0x3099}}, {0x305C, {0x305B, 0x3099}}, {0x305E, {0x305D, 0x3099}}, {0x3060, {0x305F, 0x3099}}, {0x3062, {0x3061, 0x3099}}, {0x3065, {0x3064, 0x3099}},
+{0x3067, {0x3066, 0x3099}}, {0x3069, {0x3068, 0x3099}}, {0x3070, {0x306F, 0x3099}}, {0x3071, {0x306F, 0x309A}}, {0x3073, {0x3072, 0x3099}}, {0x3074, {0x3072, 0x309A}}, {0x3076, {0x3075, 0x3099}},
+{0x3077, {0x3075, 0x309A}}, {0x3079, {0x3078, 0x3099}}, {0x307A, {0x3078, 0x309A}}, {0x307C, {0x307B, 0x3099}}, {0x307D, {0x307B, 0x309A}}, {0x3094, {0x3046, 0x3099}}, {0x309E, {0x309D, 0x3099}},
+{0x30AC, {0x30AB, 0x3099}}, {0x30AE, {0x30AD, 0x3099}}, {0x30B0, {0x30AF, 0x3099}}, {0x30B2, {0x30B1, 0x3099}}, {0x30B4, {0x30B3, 0x3099}}, {0x30B6, {0x30B5, 0x3099}}, {0x30B8, {0x30B7, 0x3099}},
+{0x30BA, {0x30B9, 0x3099}}, {0x30BC, {0x30BB, 0x3099}}, {0x30BE, {0x30BD, 0x3099}}, {0x30C0, {0x30BF, 0x3099}}, {0x30C2, {0x30C1, 0x3099}}, {0x30C5, {0x30C4, 0x3099}}, {0x30C7, {0x30C6, 0x3099}},
+{0x30C9, {0x30C8, 0x3099}}, {0x30D0, {0x30CF, 0x3099}}, {0x30D1, {0x30CF, 0x309A}}, {0x30D3, {0x30D2, 0x3099}}, {0x30D4, {0x30D2, 0x309A}}, {0x30D6, {0x30D5, 0x3099}}, {0x30D7, {0x30D5, 0x309A}},
+{0x30D9, {0x30D8, 0x3099}}, {0x30DA, {0x30D8, 0x309A}}, {0x30DC, {0x30DB, 0x3099}}, {0x30DD, {0x30DB, 0x309A}}, {0x30F4, {0x30A6, 0x3099}}, {0x30F7, {0x30EF, 0x3099}}, {0x30F8, {0x30F0, 0x3099}},
+{0x30F9, {0x30F1, 0x3099}}, {0x30FA, {0x30F2, 0x3099}}, {0x30FE, {0x30FD, 0x3099}}, {0xF900, {0x8C48}}, {0xF901, {0x66F4}}, {0xF902, {0x8ECA}}, {0xF903, {0x8CC8}}, {0xF904, {0x6ED1}},
+{0xF905, {0x4E32}}, {0xF906, {0x53E5}}, {0xF907, {0x9F9C}}, {0xF908, {0x9F9C}}, {0xF909, {0x5951}}, {0xF90A, {0x91D1}}, {0xF90B, {0x5587}}, {0xF90C, {0x5948}}, {0xF90D, {0x61F6}}, {0xF90E, {0x7669}},
+{0xF90F, {0x7F85}}, {0xF910, {0x863F}}, {0xF911, {0x87BA}}, {0xF912, {0x88F8}}, {0xF913, {0x908F}}, {0xF914, {0x6A02}}, {0xF915, {0x6D1B}}, {0xF916, {0x70D9}}, {0xF917, {0x73DE}}, {0xF918, {0x843D}},
+{0xF919, {0x916A}}, {0xF91A, {0x99F1}}, {0xF91B, {0x4E82}}, {0xF91C, {0x5375}}, {0xF91D, {0x6B04}}, {0xF91E, {0x721B}}, {0xF91F, {0x862D}}, {0xF920, {0x9E1E}}, {0xF921, {0x5D50}}, {0xF922, {0x6FEB}},
+{0xF923, {0x85CD}}, {0xF924, {0x8964}}, {0xF925, {0x62C9}}, {0xF926, {0x81D8}}, {0xF927, {0x881F}}, {0xF928, {0x5ECA}}, {0xF929, {0x6717}}, {0xF92A, {0x6D6A}}, {0xF92B, {0x72FC}}, {0xF92C, {0x90CE}},
+{0xF92D, {0x4F86}}, {0xF92E, {0x51B7}}, {0xF92F, {0x52DE}}, {0xF930, {0x64C4}}, {0xF931, {0x6AD3}}, {0xF932, {0x7210}}, {0xF933, {0x76E7}}, {0xF934, {0x8001}}, {0xF935, {0x8606}}, {0xF936, {0x865C}},
+{0xF937, {0x8DEF}}, {0xF938, {0x9732}}, {0xF939, {0x9B6F}}, {0xF93A, {0x9DFA}}, {0xF93B, {0x788C}}, {0xF93C, {0x797F}}, {0xF93D, {0x7DA0}}, {0xF93E, {0x83C9}}, {0xF93F, {0x9304}}, {0xF940, {0x9E7F}},
+{0xF941, {0x8AD6}}, {0xF942, {0x58DF}}, {0xF943, {0x5F04}}, {0xF944, {0x7C60}}, {0xF945, {0x807E}}, {0xF946, {0x7262}}, {0xF947, {0x78CA}}, {0xF948, {0x8CC2}}, {0xF949, {0x96F7}}, {0xF94A, {0x58D8}},
+{0xF94B, {0x5C62}}, {0xF94C, {0x6A13}}, {0xF94D, {0x6DDA}}, {0xF94E, {0x6F0F}}, {0xF94F, {0x7D2F}}, {0xF950, {0x7E37}}, {0xF951, {0x964B}}, {0xF952, {0x52D2}}, {0xF953, {0x808B}}, {0xF954, {0x51DC}},
+{0xF955, {0x51CC}}, {0xF956, {0x7A1C}}, {0xF957, {0x7DBE}}, {0xF958, {0x83F1}}, {0xF959, {0x9675}}, {0xF95A, {0x8B80}}, {0xF95B, {0x62CF}}, {0xF95C, {0x6A02}}, {0xF95D, {0x8AFE}}, {0xF95E, {0x4E39}},
+{0xF95F, {0x5BE7}}, {0xF960, {0x6012}}, {0xF961, {0x7387}}, {0xF962, {0x7570}}, {0xF963, {0x5317}}, {0xF964, {0x78FB}}, {0xF965, {0x4FBF}}, {0xF966, {0x5FA9}}, {0xF967, {0x4E0D}}, {0xF968, {0x6CCC}},
+{0xF969, {0x6578}}, {0xF96A, {0x7D22}}, {0xF96B, {0x53C3}}, {0xF96C, {0x585E}}, {0xF96D, {0x7701}}, {0xF96E, {0x8449}}, {0xF96F, {0x8AAA}}, {0xF970, {0x6BBA}}, {0xF971, {0x8FB0}}, {0xF972, {0x6C88}},
+{0xF973, {0x62FE}}, {0xF974, {0x82E5}}, {0xF975, {0x63A0}}, {0xF976, {0x7565}}, {0xF977, {0x4EAE}}, {0xF978, {0x5169}}, {0xF979, {0x51C9}}, {0xF97A, {0x6881}}, {0xF97B, {0x7CE7}}, {0xF97C, {0x826F}},
+{0xF97D, {0x8AD2}}, {0xF97E, {0x91CF}}, {0xF97F, {0x52F5}}, {0xF980, {0x5442}}, {0xF981, {0x5973}}, {0xF982, {0x5EEC}}, {0xF983, {0x65C5}}, {0xF984, {0x6FFE}}, {0xF985, {0x792A}}, {0xF986, {0x95AD}},
+{0xF987, {0x9A6A}}, {0xF988, {0x9E97}}, {0xF989, {0x9ECE}}, {0xF98A, {0x529B}}, {0xF98B, {0x66C6}}, {0xF98C, {0x6B77}}, {0xF98D, {0x8F62}}, {0xF98E, {0x5E74}}, {0xF98F, {0x6190}}, {0xF990, {0x6200}},
+{0xF991, {0x649A}}, {0xF992, {0x6F23}}, {0xF993, {0x7149}}, {0xF994, {0x7489}}, {0xF995, {0x79CA}}, {0xF996, {0x7DF4}}, {0xF997, {0x806F}}, {0xF998, {0x8F26}}, {0xF999, {0x84EE}}, {0xF99A, {0x9023}},
+{0xF99B, {0x934A}}, {0xF99C, {0x5217}}, {0xF99D, {0x52A3}}, {0xF99E, {0x54BD}}, {0xF99F, {0x70C8}}, {0xF9A0, {0x88C2}}, {0xF9A1, {0x8AAA}}, {0xF9A2, {0x5EC9}}, {0xF9A3, {0x5FF5}}, {0xF9A4, {0x637B}},
+{0xF9A5, {0x6BAE}}, {0xF9A6, {0x7C3E}}, {0xF9A7, {0x7375}}, {0xF9A8, {0x4EE4}}, {0xF9A9, {0x56F9}}, {0xF9AA, {0x5BE7}}, {0xF9AB, {0x5DBA}}, {0xF9AC, {0x601C}}, {0xF9AD, {0x73B2}}, {0xF9AE, {0x7469}},
+{0xF9AF, {0x7F9A}}, {0xF9B0, {0x8046}}, {0xF9B1, {0x9234}}, {0xF9B2, {0x96F6}}, {0xF9B3, {0x9748}}, {0xF9B4, {0x9818}}, {0xF9B5, {0x4F8B}}, {0xF9B6, {0x79AE}}, {0xF9B7, {0x91B4}}, {0xF9B8, {0x96B8}},
+{0xF9B9, {0x60E1}}, {0xF9BA, {0x4E86}}, {0xF9BB, {0x50DA}}, {0xF9BC, {0x5BEE}}, {0xF9BD, {0x5C3F}}, {0xF9BE, {0x6599}}, {0xF9BF, {0x6A02}}, {0xF9C0, {0x71CE}}, {0xF9C1, {0x7642}}, {0xF9C2, {0x84FC}},
+{0xF9C3, {0x907C}}, {0xF9C4, {0x9F8D}}, {0xF9C5, {0x6688}}, {0xF9C6, {0x962E}}, {0xF9C7, {0x5289}}, {0xF9C8, {0x677B}}, {0xF9C9, {0x67F3}}, {0xF9CA, {0x6D41}}, {0xF9CB, {0x6E9C}}, {0xF9CC, {0x7409}},
+{0xF9CD, {0x7559}}, {0xF9CE, {0x786B}}, {0xF9CF, {0x7D10}}, {0xF9D0, {0x985E}}, {0xF9D1, {0x516D}}, {0xF9D2, {0x622E}}, {0xF9D3, {0x9678}}, {0xF9D4, {0x502B}}, {0xF9D5, {0x5D19}}, {0xF9D6, {0x6DEA}},
+{0xF9D7, {0x8F2A}}, {0xF9D8, {0x5F8B}}, {0xF9D9, {0x6144}}, {0xF9DA, {0x6817}}, {0xF9DB, {0x7387}}, {0xF9DC, {0x9686}}, {0xF9DD, {0x5229}}, {0xF9DE, {0x540F}}, {0xF9DF, {0x5C65}}, {0xF9E0, {0x6613}},
+{0xF9E1, {0x674E}}, {0xF9E2, {0x68A8}}, {0xF9E3, {0x6CE5}}, {0xF9E4, {0x7406}}, {0xF9E5, {0x75E2}}, {0xF9E6, {0x7F79}}, {0xF9E7, {0x88CF}}, {0xF9E8, {0x88E1}}, {0xF9E9, {0x91CC}}, {0xF9EA, {0x96E2}},
+{0xF9EB, {0x533F}}, {0xF9EC, {0x6EBA}}, {0xF9ED, {0x541D}}, {0xF9EE, {0x71D0}}, {0xF9EF, {0x7498}}, {0xF9F0, {0x85FA}}, {0xF9F1, {0x96A3}}, {0xF9F2, {0x9C57}}, {0xF9F3, {0x9E9F}}, {0xF9F4, {0x6797}},
+{0xF9F5, {0x6DCB}}, {0xF9F6, {0x81E8}}, {0xF9F7, {0x7ACB}}, {0xF9F8, {0x7B20}}, {0xF9F9, {0x7C92}}, {0xF9FA, {0x72C0}}, {0xF9FB, {0x7099}}, {0xF9FC, {0x8B58}}, {0xF9FD, {0x4EC0}}, {0xF9FE, {0x8336}},
+{0xF9FF, {0x523A}}, {0xFA00, {0x5207}}, {0xFA01, {0x5EA6}}, {0xFA02, {0x62D3}}, {0xFA03, {0x7CD6}}, {0xFA04, {0x5B85}}, {0xFA05, {0x6D1E}}, {0xFA06, {0x66B4}}, {0xFA07, {0x8F3B}}, {0xFA08, {0x884C}},
+{0xFA09, {0x964D}}, {0xFA0A, {0x898B}}, {0xFA0B, {0x5ED3}}, {0xFA0C, {0x5140}}, {0xFA0D, {0x55C0}}, {0xFA10, {0x585A}}, {0xFA12, {0x6674}}, {0xFA15, {0x51DE}}, {0xFA16, {0x732A}}, {0xFA17, {0x76CA}},
+{0xFA18, {0x793C}}, {0xFA19, {0x795E}}, {0xFA1A, {0x7965}}, {0xFA1B, {0x798F}}, {0xFA1C, {0x9756}}, {0xFA1D, {0x7CBE}}, {0xFA1E, {0x7FBD}}, {0xFA20, {0x8612}}, {0xFA22, {0x8AF8}}, {0xFA25, {0x9038}},
+{0xFA26, {0x90FD}}, {0xFA2A, {0x98EF}}, {0xFA2B, {0x98FC}}, {0xFA2C, {0x9928}}, {0xFA2D, {0x9DB4}}, {0xFA2E, {0x90DE}}, {0xFA2F, {0x96B7}}, {0xFA30, {0x4FAE}}, {0xFA31, {0x50E7}}, {0xFA32, {0x514D}},
+{0xFA33, {0x52C9}}, {0xFA34, {0x52E4}}, {0xFA35, {0x5351}}, {0xFA36, {0x559D}}, {0xFA37, {0x5606}}, {0xFA38, {0x5668}}, {0xFA39, {0x5840}}, {0xFA3A, {0x58A8}}, {0xFA3B, {0x5C64}}, {0xFA3C, {0x5C6E}},
+{0xFA3D, {0x6094}}, {0xFA3E, {0x6168}}, {0xFA3F, {0x618E}}, {0xFA40, {0x61F2}}, {0xFA41, {0x654F}}, {0xFA42, {0x65E2}}, {0xFA43, {0x6691}}, {0xFA44, {0x6885}}, {0xFA45, {0x6D77}}, {0xFA46, {0x6E1A}},
+{0xFA47, {0x6F22}}, {0xFA48, {0x716E}}, {0xFA49, {0x722B}}, {0xFA4A, {0x7422}}, {0xFA4B, {0x7891}}, {0xFA4C, {0x793E}}, {0xFA4D, {0x7949}}, {0xFA4E, {0x7948}}, {0xFA4F, {0x7950}}, {0xFA50, {0x7956}},
+{0xFA51, {0x795D}}, {0xFA52, {0x798D}}, {0xFA53, {0x798E}}, {0xFA54, {0x7A40}}, {0xFA55, {0x7A81}}, {0xFA56, {0x7BC0}}, {0xFA57, {0x7DF4}}, {0xFA58, {0x7E09}}, {0xFA59, {0x7E41}}, {0xFA5A, {0x7F72}},
+{0xFA5B, {0x8005}}, {0xFA5C, {0x81ED}}, {0xFA5D, {0x8279}}, {0xFA5E, {0x8279}}, {0xFA5F, {0x8457}}, {0xFA60, {0x8910}}, {0xFA61, {0x8996}}, {0xFA62, {0x8B01}}, {0xFA63, {0x8B39}}, {0xFA64, {0x8CD3}},
+{0xFA65, {0x8D08}}, {0xFA66, {0x8FB6}}, {0xFA67, {0x9038}}, {0xFA68, {0x96E3}}, {0xFA69, {0x97FF}}, {0xFA6A, {0x983B}}, {0xFA6B, {0x6075}}, {0xFA6C, {0x242EE}}, {0xFA6D, {0x8218}}, {0xFA70, {0x4E26}},
+{0xFA71, {0x51B5}}, {0xFA72, {0x5168}}, {0xFA73, {0x4F80}}, {0xFA74, {0x5145}}, {0xFA75, {0x5180}}, {0xFA76, {0x52C7}}, {0xFA77, {0x52FA}}, {0xFA78, {0x559D}}, {0xFA79, {0x5555}}, {0xFA7A, {0x5599}},
+{0xFA7B, {0x55E2}}, {0xFA7C, {0x585A}}, {0xFA7D, {0x58B3}}, {0xFA7E, {0x5944}}, {0xFA7F, {0x5954}}, {0xFA80, {0x5A62}}, {0xFA81, {0x5B28}}, {0xFA82, {0x5ED2}}, {0xFA83, {0x5ED9}}, {0xFA84, {0x5F69}},
+{0xFA85, {0x5FAD}}, {0xFA86, {0x60D8}}, {0xFA87, {0x614E}}, {0xFA88, {0x6108}}, {0xFA89, {0x618E}}, {0xFA8A, {0x6160}}, {0xFA8B, {0x61F2}}, {0xFA8C, {0x6234}}, {0xFA8D, {0x63C4}}, {0xFA8E, {0x641C}},
+{0xFA8F, {0x6452}}, {0xFA90, {0x6556}}, {0xFA91, {0x6674}}, {0xFA92, {0x6717}}, {0xFA93, {0x671B}}, {0xFA94, {0x6756}}, {0xFA95, {0x6B79}}, {0xFA96, {0x6BBA}}, {0xFA97, {0x6D41}}, {0xFA98, {0x6EDB}},
+{0xFA99, {0x6ECB}}, {0xFA9A, {0x6F22}}, {0xFA9B, {0x701E}}, {0xFA9C, {0x716E}}, {0xFA9D, {0x77A7}}, {0xFA9E, {0x7235}}, {0xFA9F, {0x72AF}}, {0xFAA0, {0x732A}}, {0xFAA1, {0x7471}}, {0xFAA2, {0x7506}},
+{0xFAA3, {0x753B}}, {0xFAA4, {0x761D}}, {0xFAA5, {0x761F}}, {0xFAA6, {0x76CA}}, {0xFAA7, {0x76DB}}, {0xFAA8, {0x76F4}}, {0xFAA9, {0x774A}}, {0xFAAA, {0x7740}}, {0xFAAB, {0x78CC}}, {0xFAAC, {0x7AB1}},
+{0xFAAD, {0x7BC0}}, {0xFAAE, {0x7C7B}}, {0xFAAF, {0x7D5B}}, {0xFAB0, {0x7DF4}}, {0xFAB1, {0x7F3E}}, {0xFAB2, {0x8005}}, {0xFAB3, {0x8352}}, {0xFAB4, {0x83EF}}, {0xFAB5, {0x8779}}, {0xFAB6, {0x8941}},
+{0xFAB7, {0x8986}}, {0xFAB8, {0x8996}}, {0xFAB9, {0x8ABF}}, {0xFABA, {0x8AF8}}, {0xFABB, {0x8ACB}}, {0xFABC, {0x8B01}}, {0xFABD, {0x8AFE}}, {0xFABE, {0x8AED}}, {0xFABF, {0x8B39}}, {0xFAC0, {0x8B8A}},
+{0xFAC1, {0x8D08}}, {0xFAC2, {0x8F38}}, {0xFAC3, {0x9072}}, {0xFAC4, {0x9199}}, {0xFAC5, {0x9276}}, {0xFAC6, {0x967C}}, {0xFAC7, {0x96E3}}, {0xFAC8, {0x9756}}, {0xFAC9, {0x97DB}}, {0xFACA, {0x97FF}},
+{0xFACB, {0x980B}}, {0xFACC, {0x983B}}, {0xFACD, {0x9B12}}, {0xFACE, {0x9F9C}}, {0xFACF, {0x2284A}}, {0xFAD0, {0x22844}}, {0xFAD1, {0x233D5}}, {0xFAD2, {0x3B9D}}, {0xFAD3, {0x4018}},
+{0xFAD4, {0x4039}}, {0xFAD5, {0x25249}}, {0xFAD6, {0x25CD0}}, {0xFAD7, {0x27ED3}}, {0xFAD8, {0x9F43}}, {0xFAD9, {0x9F8E}}, {0xFB1D, {0x5D9, 0x5B4}}, {0xFB1F, {0x5F2, 0x5B7}}, {0xFB2A, {0x5E9, 0x5C1}},
+{0xFB2B, {0x5E9, 0x5C2}}, {0xFB2C, {0x5E9, 0x5BC, 0x5C1}}, {0xFB2D, {0x5E9, 0x5BC, 0x5C2}}, {0xFB2E, {0x5D0, 0x5B7}}, {0xFB2F, {0x5D0, 0x5B8}}, {0xFB30, {0x5D0, 0x5BC}}, {0xFB31, {0x5D1, 0x5BC}},
+{0xFB32, {0x5D2, 0x5BC}}, {0xFB33, {0x5D3, 0x5BC}}, {0xFB34, {0x5D4, 0x5BC}}, {0xFB35, {0x5D5, 0x5BC}}, {0xFB36, {0x5D6, 0x5BC}}, {0xFB38, {0x5D8, 0x5BC}}, {0xFB39, {0x5D9, 0x5BC}},
+{0xFB3A, {0x5DA, 0x5BC}}, {0xFB3B, {0x5DB, 0x5BC}}, {0xFB3C, {0x5DC, 0x5BC}}, {0xFB3E, {0x5DE, 0x5BC}}, {0xFB40, {0x5E0, 0x5BC}}, {0xFB41, {0x5E1, 0x5BC}}, {0xFB43, {0x5E3, 0x5BC}},
+{0xFB44, {0x5E4, 0x5BC}}, {0xFB46, {0x5E6, 0x5BC}}, {0xFB47, {0x5E7, 0x5BC}}, {0xFB48, {0x5E8, 0x5BC}}, {0xFB49, {0x5E9, 0x5BC}}, {0xFB4A, {0x5EA, 0x5BC}}, {0xFB4B, {0x5D5, 0x5B9}},
+{0xFB4C, {0x5D1, 0x5BF}}, {0xFB4D, {0x5DB, 0x5BF}}, {0xFB4E, {0x5E4, 0x5BF}}, {0x1109A, {0x11099, 0x110BA}}, {0x1109C, {0x1109B, 0x110BA}}, {0x110AB, {0x110A5, 0x110BA}},
+{0x1112E, {0x11131, 0x11127}}, {0x1112F, {0x11132, 0x11127}}, {0x1134B, {0x11347, 0x1133E}}, {0x1134C, {0x11347, 0x11357}}, {0x114BB, {0x114B9, 0x114BA}}, {0x114BC, {0x114B9, 0x114B0}},
+{0x114BE, {0x114B9, 0x114BD}}, {0x115BA, {0x115B8, 0x115AF}}, {0x115BB, {0x115B9, 0x115AF}}, {0x1D15E, {0x1D157, 0x1D165}}, {0x1D15F, {0x1D158, 0x1D165}}, {0x1D160, {0x1D158, 0x1D165, 0x1D16E}},
+{0x1D161, {0x1D158, 0x1D165, 0x1D16F}}, {0x1D162, {0x1D158, 0x1D165, 0x1D170}}, {0x1D163, {0x1D158, 0x1D165, 0x1D171}}, {0x1D164, {0x1D158, 0x1D165, 0x1D172}}, {0x1D1BB, {0x1D1B9, 0x1D165}},
+{0x1D1BC, {0x1D1BA, 0x1D165}}, {0x1D1BD, {0x1D1B9, 0x1D165, 0x1D16E}}, {0x1D1BE, {0x1D1BA, 0x1D165, 0x1D16E}}, {0x1D1BF, {0x1D1B9, 0x1D165, 0x1D16F}}, {0x1D1C0, {0x1D1BA, 0x1D165, 0x1D16F}},
+{0x2F800, {0x4E3D}}, {0x2F801, {0x4E38}}, {0x2F802, {0x4E41}}, {0x2F803, {0x20122}}, {0x2F804, {0x4F60}}, {0x2F805, {0x4FAE}}, {0x2F806, {0x4FBB}}, {0x2F807, {0x5002}}, {0x2F808, {0x507A}},
+{0x2F809, {0x5099}}, {0x2F80A, {0x50E7}}, {0x2F80B, {0x50CF}}, {0x2F80C, {0x349E}}, {0x2F80D, {0x2063A}}, {0x2F80E, {0x514D}}, {0x2F80F, {0x5154}}, {0x2F810, {0x5164}}, {0x2F811, {0x5177}},
+{0x2F812, {0x2051C}}, {0x2F813, {0x34B9}}, {0x2F814, {0x5167}}, {0x2F815, {0x518D}}, {0x2F816, {0x2054B}}, {0x2F817, {0x5197}}, {0x2F818, {0x51A4}}, {0x2F819, {0x4ECC}}, {0x2F81A, {0x51AC}},
+{0x2F81B, {0x51B5}}, {0x2F81C, {0x291DF}}, {0x2F81D, {0x51F5}}, {0x2F81E, {0x5203}}, {0x2F81F, {0x34DF}}, {0x2F820, {0x523B}}, {0x2F821, {0x5246}}, {0x2F822, {0x5272}}, {0x2F823, {0x5277}},
+{0x2F824, {0x3515}}, {0x2F825, {0x52C7}}, {0x2F826, {0x52C9}}, {0x2F827, {0x52E4}}, {0x2F828, {0x52FA}}, {0x2F829, {0x5305}}, {0x2F82A, {0x5306}}, {0x2F82B, {0x5317}}, {0x2F82C, {0x5349}},
+{0x2F82D, {0x5351}}, {0x2F82E, {0x535A}}, {0x2F82F, {0x5373}}, {0x2F830, {0x537D}}, {0x2F831, {0x537F}}, {0x2F832, {0x537F}}, {0x2F833, {0x537F}}, {0x2F834, {0x20A2C}}, {0x2F835, {0x7070}},
+{0x2F836, {0x53CA}}, {0x2F837, {0x53DF}}, {0x2F838, {0x20B63}}, {0x2F839, {0x53EB}}, {0x2F83A, {0x53F1}}, {0x2F83B, {0x5406}}, {0x2F83C, {0x549E}}, {0x2F83D, {0x5438}}, {0x2F83E, {0x5448}},
+{0x2F83F, {0x5468}}, {0x2F840, {0x54A2}}, {0x2F841, {0x54F6}}, {0x2F842, {0x5510}}, {0x2F843, {0x5553}}, {0x2F844, {0x5563}}, {0x2F845, {0x5584}}, {0x2F846, {0x5584}}, {0x2F847, {0x5599}},
+{0x2F848, {0x55AB}}, {0x2F849, {0x55B3}}, {0x2F84A, {0x55C2}}, {0x2F84B, {0x5716}}, {0x2F84C, {0x5606}}, {0x2F84D, {0x5717}}, {0x2F84E, {0x5651}}, {0x2F84F, {0x5674}}, {0x2F850, {0x5207}},
+{0x2F851, {0x58EE}}, {0x2F852, {0x57CE}}, {0x2F853, {0x57F4}}, {0x2F854, {0x580D}}, {0x2F855, {0x578B}}, {0x2F856, {0x5832}}, {0x2F857, {0x5831}}, {0x2F858, {0x58AC}}, {0x2F859, {0x214E4}},
+{0x2F85A, {0x58F2}}, {0x2F85B, {0x58F7}}, {0x2F85C, {0x5906}}, {0x2F85D, {0x591A}}, {0x2F85E, {0x5922}}, {0x2F85F, {0x5962}}, {0x2F860, {0x216A8}}, {0x2F861, {0x216EA}}, {0x2F862, {0x59EC}},
+{0x2F863, {0x5A1B}}, {0x2F864, {0x5A27}}, {0x2F865, {0x59D8}}, {0x2F866, {0x5A66}}, {0x2F867, {0x36EE}}, {0x2F868, {0x36FC}}, {0x2F869, {0x5B08}}, {0x2F86A, {0x5B3E}}, {0x2F86B, {0x5B3E}},
+{0x2F86C, {0x219C8}}, {0x2F86D, {0x5BC3}}, {0x2F86E, {0x5BD8}}, {0x2F86F, {0x5BE7}}, {0x2F870, {0x5BF3}}, {0x2F871, {0x21B18}}, {0x2F872, {0x5BFF}}, {0x2F873, {0x5C06}}, {0x2F874, {0x5F53}},
+{0x2F875, {0x5C22}}, {0x2F876, {0x3781}}, {0x2F877, {0x5C60}}, {0x2F878, {0x5C6E}}, {0x2F879, {0x5CC0}}, {0x2F87A, {0x5C8D}}, {0x2F87B, {0x21DE4}}, {0x2F87C, {0x5D43}}, {0x2F87D, {0x21DE6}},
+{0x2F87E, {0x5D6E}}, {0x2F87F, {0x5D6B}}, {0x2F880, {0x5D7C}}, {0x2F881, {0x5DE1}}, {0x2F882, {0x5DE2}}, {0x2F883, {0x382F}}, {0x2F884, {0x5DFD}}, {0x2F885, {0x5E28}}, {0x2F886, {0x5E3D}},
+{0x2F887, {0x5E69}}, {0x2F888, {0x3862}}, {0x2F889, {0x22183}}, {0x2F88A, {0x387C}}, {0x2F88B, {0x5EB0}}, {0x2F88C, {0x5EB3}}, {0x2F88D, {0x5EB6}}, {0x2F88E, {0x5ECA}}, {0x2F88F, {0x2A392}},
+{0x2F890, {0x5EFE}}, {0x2F891, {0x22331}}, {0x2F892, {0x22331}}, {0x2F893, {0x8201}}, {0x2F894, {0x5F22}}, {0x2F895, {0x5F22}}, {0x2F896, {0x38C7}}, {0x2F897, {0x232B8}}, {0x2F898, {0x261DA}},
+{0x2F899, {0x5F62}}, {0x2F89A, {0x5F6B}}, {0x2F89B, {0x38E3}}, {0x2F89C, {0x5F9A}}, {0x2F89D, {0x5FCD}}, {0x2F89E, {0x5FD7}}, {0x2F89F, {0x5FF9}}, {0x2F8A0, {0x6081}}, {0x2F8A1, {0x393A}},
+{0x2F8A2, {0x391C}}, {0x2F8A3, {0x6094}}, {0x2F8A4, {0x226D4}}, {0x2F8A5, {0x60C7}}, {0x2F8A6, {0x6148}}, {0x2F8A7, {0x614C}}, {0x2F8A8, {0x614E}}, {0x2F8A9, {0x614C}}, {0x2F8AA, {0x617A}},
+{0x2F8AB, {0x618E}}, {0x2F8AC, {0x61B2}}, {0x2F8AD, {0x61A4}}, {0x2F8AE, {0x61AF}}, {0x2F8AF, {0x61DE}}, {0x2F8B0, {0x61F2}}, {0x2F8B1, {0x61F6}}, {0x2F8B2, {0x6210}}, {0x2F8B3, {0x621B}},
+{0x2F8B4, {0x625D}}, {0x2F8B5, {0x62B1}}, {0x2F8B6, {0x62D4}}, {0x2F8B7, {0x6350}}, {0x2F8B8, {0x22B0C}}, {0x2F8B9, {0x633D}}, {0x2F8BA, {0x62FC}}, {0x2F8BB, {0x6368}}, {0x2F8BC, {0x6383}},
+{0x2F8BD, {0x63E4}}, {0x2F8BE, {0x22BF1}}, {0x2F8BF, {0x6422}}, {0x2F8C0, {0x63C5}}, {0x2F8C1, {0x63A9}}, {0x2F8C2, {0x3A2E}}, {0x2F8C3, {0x6469}}, {0x2F8C4, {0x647E}}, {0x2F8C5, {0x649D}},
+{0x2F8C6, {0x6477}}, {0x2F8C7, {0x3A6C}}, {0x2F8C8, {0x654F}}, {0x2F8C9, {0x656C}}, {0x2F8CA, {0x2300A}}, {0x2F8CB, {0x65E3}}, {0x2F8CC, {0x66F8}}, {0x2F8CD, {0x6649}}, {0x2F8CE, {0x3B19}},
+{0x2F8CF, {0x6691}}, {0x2F8D0, {0x3B08}}, {0x2F8D1, {0x3AE4}}, {0x2F8D2, {0x5192}}, {0x2F8D3, {0x5195}}, {0x2F8D4, {0x6700}}, {0x2F8D5, {0x669C}}, {0x2F8D6, {0x80AD}}, {0x2F8D7, {0x43D9}},
+{0x2F8D8, {0x6717}}, {0x2F8D9, {0x671B}}, {0x2F8DA, {0x6721}}, {0x2F8DB, {0x675E}}, {0x2F8DC, {0x6753}}, {0x2F8DD, {0x233C3}}, {0x2F8DE, {0x3B49}}, {0x2F8DF, {0x67FA}}, {0x2F8E0, {0x6785}},
+{0x2F8E1, {0x6852}}, {0x2F8E2, {0x6885}}, {0x2F8E3, {0x2346D}}, {0x2F8E4, {0x688E}}, {0x2F8E5, {0x681F}}, {0x2F8E6, {0x6914}}, {0x2F8E7, {0x3B9D}}, {0x2F8E8, {0x6942}}, {0x2F8E9, {0x69A3}},
+{0x2F8EA, {0x69EA}}, {0x2F8EB, {0x6AA8}}, {0x2F8EC, {0x236A3}}, {0x2F8ED, {0x6ADB}}, {0x2F8EE, {0x3C18}}, {0x2F8EF, {0x6B21}}, {0x2F8F0, {0x238A7}}, {0x2F8F1, {0x6B54}}, {0x2F8F2, {0x3C4E}},
+{0x2F8F3, {0x6B72}}, {0x2F8F4, {0x6B9F}}, {0x2F8F5, {0x6BBA}}, {0x2F8F6, {0x6BBB}}, {0x2F8F7, {0x23A8D}}, {0x2F8F8, {0x21D0B}}, {0x2F8F9, {0x23AFA}}, {0x2F8FA, {0x6C4E}}, {0x2F8FB, {0x23CBC}},
+{0x2F8FC, {0x6CBF}}, {0x2F8FD, {0x6CCD}}, {0x2F8FE, {0x6C67}}, {0x2F8FF, {0x6D16}}, {0x2F900, {0x6D3E}}, {0x2F901, {0x6D77}}, {0x2F902, {0x6D41}}, {0x2F903, {0x6D69}}, {0x2F904, {0x6D78}},
+{0x2F905, {0x6D85}}, {0x2F906, {0x23D1E}}, {0x2F907, {0x6D34}}, {0x2F908, {0x6E2F}}, {0x2F909, {0x6E6E}}, {0x2F90A, {0x3D33}}, {0x2F90B, {0x6ECB}}, {0x2F90C, {0x6EC7}}, {0x2F90D, {0x23ED1}},
+{0x2F90E, {0x6DF9}}, {0x2F90F, {0x6F6E}}, {0x2F910, {0x23F5E}}, {0x2F911, {0x23F8E}}, {0x2F912, {0x6FC6}}, {0x2F913, {0x7039}}, {0x2F914, {0x701E}}, {0x2F915, {0x701B}}, {0x2F916, {0x3D96}},
+{0x2F917, {0x704A}}, {0x2F918, {0x707D}}, {0x2F919, {0x7077}}, {0x2F91A, {0x70AD}}, {0x2F91B, {0x20525}}, {0x2F91C, {0x7145}}, {0x2F91D, {0x24263}}, {0x2F91E, {0x719C}}, {0x2F91F, {0x243AB}},
+{0x2F920, {0x7228}}, {0x2F921, {0x7235}}, {0x2F922, {0x7250}}, {0x2F923, {0x24608}}, {0x2F924, {0x7280}}, {0x2F925, {0x7295}}, {0x2F926, {0x24735}}, {0x2F927, {0x24814}}, {0x2F928, {0x737A}},
+{0x2F929, {0x738B}}, {0x2F92A, {0x3EAC}}, {0x2F92B, {0x73A5}}, {0x2F92C, {0x3EB8}}, {0x2F92D, {0x3EB8}}, {0x2F92E, {0x7447}}, {0x2F92F, {0x745C}}, {0x2F930, {0x7471}}, {0x2F931, {0x7485}},
+{0x2F932, {0x74CA}}, {0x2F933, {0x3F1B}}, {0x2F934, {0x7524}}, {0x2F935, {0x24C36}}, {0x2F936, {0x753E}}, {0x2F937, {0x24C92}}, {0x2F938, {0x7570}}, {0x2F939, {0x2219F}}, {0x2F93A, {0x7610}},
+{0x2F93B, {0x24FA1}}, {0x2F93C, {0x24FB8}}, {0x2F93D, {0x25044}}, {0x2F93E, {0x3FFC}}, {0x2F93F, {0x4008}}, {0x2F940, {0x76F4}}, {0x2F941, {0x250F3}}, {0x2F942, {0x250F2}}, {0x2F943, {0x25119}},
+{0x2F944, {0x25133}}, {0x2F945, {0x771E}}, {0x2F946, {0x771F}}, {0x2F947, {0x771F}}, {0x2F948, {0x774A}}, {0x2F949, {0x4039}}, {0x2F94A, {0x778B}}, {0x2F94B, {0x4046}}, {0x2F94C, {0x4096}},
+{0x2F94D, {0x2541D}}, {0x2F94E, {0x784E}}, {0x2F94F, {0x788C}}, {0x2F950, {0x78CC}}, {0x2F951, {0x40E3}}, {0x2F952, {0x25626}}, {0x2F953, {0x7956}}, {0x2F954, {0x2569A}}, {0x2F955, {0x256C5}},
+{0x2F956, {0x798F}}, {0x2F957, {0x79EB}}, {0x2F958, {0x412F}}, {0x2F959, {0x7A40}}, {0x2F95A, {0x7A4A}}, {0x2F95B, {0x7A4F}}, {0x2F95C, {0x2597C}}, {0x2F95D, {0x25AA7}}, {0x2F95E, {0x25AA7}},
+{0x2F95F, {0x7AEE}}, {0x2F960, {0x4202}}, {0x2F961, {0x25BAB}}, {0x2F962, {0x7BC6}}, {0x2F963, {0x7BC9}}, {0x2F964, {0x4227}}, {0x2F965, {0x25C80}}, {0x2F966, {0x7CD2}}, {0x2F967, {0x42A0}},
+{0x2F968, {0x7CE8}}, {0x2F969, {0x7CE3}}, {0x2F96A, {0x7D00}}, {0x2F96B, {0x25F86}}, {0x2F96C, {0x7D63}}, {0x2F96D, {0x4301}}, {0x2F96E, {0x7DC7}}, {0x2F96F, {0x7E02}}, {0x2F970, {0x7E45}},
+{0x2F971, {0x4334}}, {0x2F972, {0x26228}}, {0x2F973, {0x26247}}, {0x2F974, {0x4359}}, {0x2F975, {0x262D9}}, {0x2F976, {0x7F7A}}, {0x2F977, {0x2633E}}, {0x2F978, {0x7F95}}, {0x2F979, {0x7FFA}},
+{0x2F97A, {0x8005}}, {0x2F97B, {0x264DA}}, {0x2F97C, {0x26523}}, {0x2F97D, {0x8060}}, {0x2F97E, {0x265A8}}, {0x2F97F, {0x8070}}, {0x2F980, {0x2335F}}, {0x2F981, {0x43D5}}, {0x2F982, {0x80B2}},
+{0x2F983, {0x8103}}, {0x2F984, {0x440B}}, {0x2F985, {0x813E}}, {0x2F986, {0x5AB5}}, {0x2F987, {0x267A7}}, {0x2F988, {0x267B5}}, {0x2F989, {0x23393}}, {0x2F98A, {0x2339C}}, {0x2F98B, {0x8201}},
+{0x2F98C, {0x8204}}, {0x2F98D, {0x8F9E}}, {0x2F98E, {0x446B}}, {0x2F98F, {0x8291}}, {0x2F990, {0x828B}}, {0x2F991, {0x829D}}, {0x2F992, {0x52B3}}, {0x2F993, {0x82B1}}, {0x2F994, {0x82B3}},
+{0x2F995, {0x82BD}}, {0x2F996, {0x82E6}}, {0x2F997, {0x26B3C}}, {0x2F998, {0x82E5}}, {0x2F999, {0x831D}}, {0x2F99A, {0x8363}}, {0x2F99B, {0x83AD}}, {0x2F99C, {0x8323}}, {0x2F99D, {0x83BD}},
+{0x2F99E, {0x83E7}}, {0x2F99F, {0x8457}}, {0x2F9A0, {0x8353}}, {0x2F9A1, {0x83CA}}, {0x2F9A2, {0x83CC}}, {0x2F9A3, {0x83DC}}, {0x2F9A4, {0x26C36}}, {0x2F9A5, {0x26D6B}}, {0x2F9A6, {0x26CD5}},
+{0x2F9A7, {0x452B}}, {0x2F9A8, {0x84F1}}, {0x2F9A9, {0x84F3}}, {0x2F9AA, {0x8516}}, {0x2F9AB, {0x273CA}}, {0x2F9AC, {0x8564}}, {0x2F9AD, {0x26F2C}}, {0x2F9AE, {0x455D}}, {0x2F9AF, {0x4561}},
+{0x2F9B0, {0x26FB1}}, {0x2F9B1, {0x270D2}}, {0x2F9B2, {0x456B}}, {0x2F9B3, {0x8650}}, {0x2F9B4, {0x865C}}, {0x2F9B5, {0x8667}}, {0x2F9B6, {0x8669}}, {0x2F9B7, {0x86A9}}, {0x2F9B8, {0x8688}},
+{0x2F9B9, {0x870E}}, {0x2F9BA, {0x86E2}}, {0x2F9BB, {0x8779}}, {0x2F9BC, {0x8728}}, {0x2F9BD, {0x876B}}, {0x2F9BE, {0x8786}}, {0x2F9BF, {0x45D7}}, {0x2F9C0, {0x87E1}}, {0x2F9C1, {0x8801}},
+{0x2F9C2, {0x45F9}}, {0x2F9C3, {0x8860}}, {0x2F9C4, {0x8863}}, {0x2F9C5, {0x27667}}, {0x2F9C6, {0x88D7}}, {0x2F9C7, {0x88DE}}, {0x2F9C8, {0x4635}}, {0x2F9C9, {0x88FA}}, {0x2F9CA, {0x34BB}},
+{0x2F9CB, {0x278AE}}, {0x2F9CC, {0x27966}}, {0x2F9CD, {0x46BE}}, {0x2F9CE, {0x46C7}}, {0x2F9CF, {0x8AA0}}, {0x2F9D0, {0x8AED}}, {0x2F9D1, {0x8B8A}}, {0x2F9D2, {0x8C55}}, {0x2F9D3, {0x27CA8}},
+{0x2F9D4, {0x8CAB}}, {0x2F9D5, {0x8CC1}}, {0x2F9D6, {0x8D1B}}, {0x2F9D7, {0x8D77}}, {0x2F9D8, {0x27F2F}}, {0x2F9D9, {0x20804}}, {0x2F9DA, {0x8DCB}}, {0x2F9DB, {0x8DBC}}, {0x2F9DC, {0x8DF0}},
+{0x2F9DD, {0x208DE}}, {0x2F9DE, {0x8ED4}}, {0x2F9DF, {0x8F38}}, {0x2F9E0, {0x285D2}}, {0x2F9E1, {0x285ED}}, {0x2F9E2, {0x9094}}, {0x2F9E3, {0x90F1}}, {0x2F9E4, {0x9111}}, {0x2F9E5, {0x2872E}},
+{0x2F9E6, {0x911B}}, {0x2F9E7, {0x9238}}, {0x2F9E8, {0x92D7}}, {0x2F9E9, {0x92D8}}, {0x2F9EA, {0x927C}}, {0x2F9EB, {0x93F9}}, {0x2F9EC, {0x9415}}, {0x2F9ED, {0x28BFA}}, {0x2F9EE, {0x958B}},
+{0x2F9EF, {0x4995}}, {0x2F9F0, {0x95B7}}, {0x2F9F1, {0x28D77}}, {0x2F9F2, {0x49E6}}, {0x2F9F3, {0x96C3}}, {0x2F9F4, {0x5DB2}}, {0x2F9F5, {0x9723}}, {0x2F9F6, {0x29145}}, {0x2F9F7, {0x2921A}},
+{0x2F9F8, {0x4A6E}}, {0x2F9F9, {0x4A76}}, {0x2F9FA, {0x97E0}}, {0x2F9FB, {0x2940A}}, {0x2F9FC, {0x4AB2}}, {0x2F9FD, {0x29496}}, {0x2F9FE, {0x980B}}, {0x2F9FF, {0x980B}}, {0x2FA00, {0x9829}},
+{0x2FA01, {0x295B6}}, {0x2FA02, {0x98E2}}, {0x2FA03, {0x4B33}}, {0x2FA04, {0x9929}}, {0x2FA05, {0x99A7}}, {0x2FA06, {0x99C2}}, {0x2FA07, {0x99FE}}, {0x2FA08, {0x4BCE}}, {0x2FA09, {0x29B30}},
+{0x2FA0A, {0x9B12}}, {0x2FA0B, {0x9C40}}, {0x2FA0C, {0x9CFD}}, {0x2FA0D, {0x4CCE}}, {0x2FA0E, {0x4CED}}, {0x2FA0F, {0x9D67}}, {0x2FA10, {0x2A0CE}}, {0x2FA11, {0x4CF8}}, {0x2FA12, {0x2A105}},
+{0x2FA13, {0x2A20E}}, {0x2FA14, {0x2A291}}, {0x2FA15, {0x9EBB}}, {0x2FA16, {0x4D56}}, {0x2FA17, {0x9EF9}}, {0x2FA18, {0x9EFE}}, {0x2FA19, {0x9F05}}, {0x2FA1A, {0x9F0F}}, {0x2FA1B, {0x9F16}},
+{0x2FA1D, {0x2A600}},
+};
+
 static std::string codepoint_to_utf8(uint32_t cp) {
     std::string result;
     if (/* 0x00 <= cp && */ cp <= 0x7f) {

From adcb12a9bad87bc96f2f158c95892b3d04aa7ffb Mon Sep 17 00:00:00 2001
From: compilade <113953597+compilade@users.noreply.github.com>
Date: Wed, 28 Feb 2024 03:52:56 -0500
Subject: [PATCH 05/16] llama : fix non-quantization of expert gating tensors
 (#5754)

This reverts a single line from #5475
---
 llama.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 356ca1076..893bcdbc0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11162,7 +11162,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");

From a693bea1e6762a17b78b6ddf4611e54136941ea2 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 28 Feb 2024 09:55:37 +0100
Subject: [PATCH 06/16] server : hit Ctrl+C twice to exit (#5734)

* server: twice ctrl+C to exit

* std::atomic_flag

* sigint: message

* sigint: stderr

* Update examples/server/server.cpp

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
---
 examples/server/server.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6b3ee531c..080fa9bd5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2772,7 +2772,16 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }
 
 std::function<void(int)> shutdown_handler;
-inline void signal_handler(int signal) { shutdown_handler(signal); }
+std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+inline void signal_handler(int signal) {
+    if (is_terminating.test_and_set()) {
+        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+        // this is for better developer experience, we can remove when the server is stable enough
+        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
+        exit(1);
+    }
+    shutdown_handler(signal);
+}
 
 int main(int argc, char **argv)
 {

From 5f706718566e3a5147916dc381f3b99de0ffad47 Mon Sep 17 00:00:00 2001
From: "UEXTM.com" <84163508+uextm@users.noreply.github.com>
Date: Sat, 24 Feb 2024 11:27:36 -0500
Subject: [PATCH 07/16] Introduce backend GUIDs (ggml/743)

* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 ggml-backend-impl.h |  2 ++
 ggml-backend.c      | 16 ++++++++++++++--
 ggml-backend.h      |  2 +-
 ggml-cuda.cu        |  8 +++++++-
 ggml-kompute.cpp    |  8 +++++++-
 ggml-metal.m        |  8 +++++++-
 ggml-sycl.cpp       |  8 +++++++-
 ggml-vulkan.cpp     |  8 +++++++-
 ggml.c              |  4 ++++
 ggml.h              | 10 ++++++++++
 10 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index f95df47f7..0e5bf0ae1 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -104,6 +104,8 @@ extern "C" {
     };
 
     struct ggml_backend {
+        ggml_guid_t guid;
+
         struct ggml_backend_i iface;
 
         ggml_backend_context_t context;
diff --git a/ggml-backend.c b/ggml-backend.c
index 5076d9e5e..c86673b04 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -12,7 +12,6 @@
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-
 // backend buffer type
 
 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml
 
 // backend
 
+ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return NULL;
+    }
+    return backend->guid;
+}
+
 const char * ggml_backend_name(ggml_backend_t backend) {
     if (backend == NULL) {
         return "NULL";
@@ -781,6 +787,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
 };
 
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
     if (ctx == NULL) {
@@ -800,6 +811,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }
 
     *cpu_backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_cpu_guid(),
         /* .interface = */ cpu_backend_i,
         /* .context   = */ ctx
     };
@@ -807,7 +819,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
 }
 
 GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_cpu_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
 }
 
 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
diff --git a/ggml-backend.h b/ggml-backend.h
index f13c69bff..8fb54bd92 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -49,7 +49,7 @@ extern "C" {
     // Backend
     //
 
-
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
     GGML_API void         ggml_backend_free(ggml_backend_t backend);
 
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 831c84efb..0c6501e98 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -12277,6 +12277,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
+static ggml_guid_t ggml_backend_cuda_guid() {
+    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
@@ -12294,6 +12299,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     };
 
     ggml_backend_t cuda_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cuda_guid(),
         /* .interface = */ ggml_backend_cuda_interface,
         /* .context   = */ ctx
     };
@@ -12302,7 +12308,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
 }
 
 GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_cuda_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
 }
 
 GGML_CALL int ggml_backend_cuda_get_device_count() {
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 51c5af8ec..e740a76d1 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -1953,11 +1953,17 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
 };
 
+static ggml_guid_t ggml_backend_kompute_guid() {
+    static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_kompute_init(int device) {
     GGML_ASSERT(s_kompute_context == nullptr);
     s_kompute_context = new ggml_kompute_context(device);
 
     ggml_backend_t kompute_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_kompute_guid(),
         /* .interface = */ kompute_backend_i,
         /* .context   = */ s_kompute_context,
     };
@@ -1966,7 +1972,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
 }
 
 bool ggml_backend_is_kompute(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_kompute_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
 }
 
 static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
diff --git a/ggml-metal.m b/ggml-metal.m
index 9eba2f5d2..71fcca560 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2771,6 +2771,11 @@ void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void *
     ggml_metal_log_user_data = user_data;
 }
 
+static ggml_guid_t ggml_backend_metal_guid(void) {
+    static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_metal_init(void) {
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
@@ -2781,6 +2786,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
     ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
 
     *metal_backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_metal_guid(),
         /* .interface = */ ggml_backend_metal_i,
         /* .context   = */ ctx,
     };
@@ -2789,7 +2795,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
 }
 
 bool ggml_backend_is_metal(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_metal_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid());
 }
 
 void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 835967fb6..a054ec8b9 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15162,6 +15162,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
 };
 
+static ggml_guid_t ggml_backend_sycl_guid() {
+    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_sycl_init(int device) {
     ggml_init_sycl(); // TODO: remove from ggml.c
 
@@ -15179,6 +15184,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     };
 
     ggml_backend_t sycl_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_sycl_guid(),
         /* .interface = */ ggml_backend_sycl_interface,
         /* .context   = */ ctx
     };
@@ -15187,7 +15193,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 }
 
 bool ggml_backend_is_sycl(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_sycl_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
 }
 
 static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 6caafb822..314e3d7a9 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -5244,6 +5244,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .supports_op             = */ ggml_backend_vk_supports_op,
 };
 
+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     if (vk_instance.initialized[idx]) {
         return vk_instance.backends[idx];
@@ -5262,6 +5267,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
     vk_instance.initialized[idx] = true;
 
     ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_vk_guid(),
         /* .interface = */ ggml_backend_vk_interface,
         /* .context   = */ &vk_instance.contexts[ctx->idx],
     };
@@ -5272,7 +5278,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
 }
 
 GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_vk_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
 }
 
 GGML_CALL int ggml_backend_vk_get_device_count() {
diff --git a/ggml.c b/ggml.c
index 4591644ad..68ac6201e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -355,6 +355,10 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
     }
 }
 
+bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
+    return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
+}
+
 //
 // timing
 //
diff --git a/ggml.h b/ggml.h
index 23b768640..0a6d3c051 100644
--- a/ggml.h
+++ b/ggml.h
@@ -672,6 +672,16 @@ extern "C" {
         GGML_NUMA_STRATEGY_COUNT
     };
 
+    //
+    // GUID
+    //
+
+    // GUID types
+    typedef uint8_t ggml_guid[16];
+    typedef ggml_guid * ggml_guid_t;
+
+    GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
+
     // misc
 
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program

From 2774b0c97427ee3ad3e2ee121354d078794e89d9 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sun, 25 Feb 2024 20:41:35 +0100
Subject: [PATCH 08/16] add google magika inference example (ggml/748)

* add magika inference example

* ggml : fix unaligned accesses in custom ops

* ggml : fix FP32 GELU for values that exceed the FP16 range

* use ggml_pool_1d

* add README

* Update README.md

* pad inputs if the files are too small

* cleanup

ggml-ci
---
 ggml.c | 54 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/ggml.c b/ggml.c
index 68ac6201e..f29b9f13f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1608,9 +1608,15 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        }
     }
 }
 #else
@@ -5780,11 +5786,13 @@ struct ggml_tensor * ggml_pool_1d(
         is_node = true;
     }
 
-    const int64_t ne[2] = {
+    const int64_t ne[4] = {
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
+        a->ne[2],
+        a->ne[3],
     };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
     ggml_set_op_params(result, params, sizeof(params));
@@ -15081,9 +15089,10 @@ static void ggml_compute_forward_map_custom1(
         return;
     }
 
-    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+    struct ggml_map_custom1_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
 
-    p->fun(dst, a, params->ith, params->nth, p->userdata);
+    p.fun(dst, a, params->ith, params->nth, p.userdata);
 }
 
 // ggml_compute_forward_map_custom2
@@ -15099,9 +15108,10 @@ static void ggml_compute_forward_map_custom2(
         return;
     }
 
-    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+    struct ggml_map_custom2_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
 
-    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+    p.fun(dst, a, b, params->ith, params->nth, p.userdata);
 }
 
 // ggml_compute_forward_map_custom3
@@ -15118,9 +15128,10 @@ static void ggml_compute_forward_map_custom3(
         return;
     }
 
-    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+    struct ggml_map_custom3_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
 
-    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
+    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
@@ -17386,29 +17397,32 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_MAP_CUSTOM1:
             {
-                struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                struct ggml_map_custom1_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
                     n_tasks = n_threads;
                 } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
+                    n_tasks = MIN(p.n_tasks, n_threads);
                 }
             } break;
         case GGML_OP_MAP_CUSTOM2:
             {
-                struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                struct ggml_map_custom2_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
                     n_tasks = n_threads;
                 } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
+                    n_tasks = MIN(p.n_tasks, n_threads);
                 }
             } break;
         case GGML_OP_MAP_CUSTOM3:
             {
-                struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
+                struct ggml_map_custom3_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
                     n_tasks = n_threads;
                 } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
+                    n_tasks = MIN(p.n_tasks, n_threads);
                 }
             } break;
         case GGML_OP_CROSS_ENTROPY_LOSS:

From 8c0e8f4e73e275756ad69f9c99b26ead085ca9f0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 28 Feb 2024 11:17:32 +0200
Subject: [PATCH 09/16] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 59de34370..389c0bdfe 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-8cdf783f288a98eddf521b0ab1b4d405be9e18ba
+b458250b736a7473f7ff3560d47c93f1644f3290

From 78aacf36344df724cdca9f1e1af849b2d2519cb8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 28 Feb 2024 17:36:53 +0200
Subject: [PATCH 10/16] awq-py : remove (#5768)

---
 awq-py/README.md        | 116 ------------------
 awq-py/awq/apply_awq.py | 254 ----------------------------------------
 awq-py/requirements.txt |   2 -
 3 files changed, 372 deletions(-)
 delete mode 100644 awq-py/README.md
 delete mode 100644 awq-py/awq/apply_awq.py
 delete mode 100644 awq-py/requirements.txt

diff --git a/awq-py/README.md b/awq-py/README.md
deleted file mode 100644
index 16e68d027..000000000
--- a/awq-py/README.md
+++ /dev/null
@@ -1,116 +0,0 @@
-# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
-[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
-
-**Supported models:**
-
-- [X] LLaMA
-- [x] LLaMA 2
-- [X] MPT
-- [X] Mistral AI v0.1
-- [ ] Bloom
-- [ ] Mixtral MoE
-
-**TODO:**
-- [x] Update version work with both MPT and MPT-AWQ model
-- [ ] Add OPT model
-- [ ] Add Bloom model
-- [ ] Add Mixtral MoE
-- [ ] Support w3, w2
-
-
-## Contents
-
-- [Install](##Install)
-- [Convert](##Convert)
-- [Quantize](##Quantize)
-- [Test](##Test)
-- [Benchmark](##Benchmark)
-- [Results](##Results)
-
-## Install
-Install requirements
-```bash
-pip install -r requirements.txt
-```
-Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
-```bash
-git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
-```
-
-## Convert
-Example for llama model
-```bash
-# For llama7b and llama2 models
-python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
-# For mistral and mpt models
-python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
-```
-
-## Quantize
-```bash
-# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
-./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
-```
-
-## Test
-```bash
-# For all models.
-./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
-```
-
-## Benchmark
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-```bash
-# For llama and llama2, and mistral models.
-./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
-```
-
-## Results
-Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
-We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
-
-### Llama 7B (Build with OpenBLAS)
-
-| Model      | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-----------:|--------------|-------:|-------:|-------:|-------:|
-|Llama 7B    | perplexity   | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
-|Llama 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama 7B| perplexity   | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
-|AWQ-LLama 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Llama2 7B (Build with CuBLAS)
-
-| Model       | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|------------:|--------------|-------:|-------:|-------:|-------:|
-|Llama2 7B    | perplexity   | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
-|Llama2 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama2 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama2 7B| perplexity   | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
-|AWQ-LLama2 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama2 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Mistral 7B v0.1 (Build with CuBLAS)
-
-| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-------------:|--------------|-------:|-------:|-------:|-------:|
-|Mistral 7B    | perplexity   | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
-|Mistral 7B    | file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|Mistral 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-Mistral 7B| perplexity   | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
-|AWQ-Mistral 7B| file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|AWQ-Mistral 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-### MPT 7B (Build with OpenBLAS)
-
-| Model    | Measure      | F16    | Q4_0   | Q4_1   | Q2_K    |
-|---------:|--------------|-------:|-------:|-------:|--------:|
-|MPT 7B    | perplexity   | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
-|MPT 7B    | file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|MPT 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
-|AWQ-MPT 7B| perplexity   | 8.4944 | 8.7053 |  8.6750 | 10.2873|
-|AWQ-MPT 7B| file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|AWQ-MPT 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py
deleted file mode 100644
index 11132c5d2..000000000
--- a/awq-py/awq/apply_awq.py
+++ /dev/null
@@ -1,254 +0,0 @@
-"""
-Implements the AWQ for llama.cpp use cases.
-Original paper: https://arxiv.org/abs/2306.00978
-
-This code is based on versions of the AWQ implementation found in the following repositories:
-* https://github.com/mit-han-lab/llm-awq
-* https://github.com/casper-hansen/AutoAWQ
-"""
-
-import os
-import torch
-import torch.nn as nn
-
-from transformers import AutoModelForCausalLM, AutoConfig
-from transformers.models.bloom.modeling_bloom import BloomGelu
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
-from transformers.activations import GELUActivation
-
-
-class ScaledActivation(nn.Module):
-    """
-    ScaledActivation module wraps an existing activation function and applies a
-    scale factor to its output.
-
-    Args:
-        module (nn.Module): The activation function to be scaled.
-        scales (torch.Tensor): A tensor of size (num_features,) containing the initial
-            scale factors for each feature.
-
-    Returns:
-        torch.Tensor: The scaled output of the activation function.
-    """
-
-    def __init__(self, module, scales):
-        super().__init__()
-        self.act = module
-        self.scales = nn.Parameter(scales.data)
-
-    def forward(self, x):
-        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
-
-
-def set_op_by_name(layer, name, new_module):
-    """
-    Set the new module for given module's name.
-
-    Args:
-        layer (nn.Module): The layer in which to replace the submodule.
-        name (str): The path to the submodule to be replaced, using dot notation
-            to access nested modules.
-        new_module (nn.Module): The new module to replace the existing one.
-    """
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_op_by_name(module, op_name):
-    """
-    Retrieves a submodule within a given layer based on its name.
-
-    Args:
-        module (nn.Module): The layer containing the submodule to find.
-        op_name (str): The name of the submodule.
-
-    Returns:
-        nn.Module: The requested submodule found within the given layer.
-
-    Raises:
-        ValueError: If the specified submodule cannot be found within the layer.
-    """
-    for name, m in module.named_modules():
-        if name == op_name:
-            return m
-    raise ValueError(f"Cannot find op {op_name} in module {module}")
-
-
-@torch.no_grad()
-def scale_ln_fcs(ln, fcs, scales):
-    """
-    Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
-
-    Args:
-        ln (nn.LayerNorm): The LayerNorm module to be scaled.
-        fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-
-    if not isinstance(fcs, list):
-        fcs = [fcs]
-
-    scales = scales.to(ln.weight.device)
-
-    ln.weight.div_(scales)
-    if hasattr(ln, "bias") and ln.bias is not None:
-        ln.bias.div_(scales)
-
-    for fc in fcs:
-        fc.weight.mul_(scales.view(1, -1))
-
-    for p in ln.parameters():
-        assert torch.isnan(p).sum() == 0
-    for fc in fcs:
-        for p in fc.parameters():
-            assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_fc_fc(fc1, fc2, scales):
-    """
-    Scales the weights of two fully-connected layers in a specific pattern.
-
-    Args:
-        fc1 (nn.Linear): The first fully-connected layer to be scaled.
-        fc2 (nn.Linear): The second fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-    assert isinstance(fc1, nn.Linear)
-    assert isinstance(fc2, nn.Linear)
-
-    scales = scales.to(fc1.weight.device)
-
-    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
-    if fc1.bias is not None:
-        fc1.bias.div_(scales.view(-1))
-
-    fc2.weight.mul_(scales.view(1, -1))
-
-    for p in fc1.parameters():
-        assert torch.isnan(p).sum() == 0
-    for p in fc2.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_gelu_fc(gelu, fc, scales):
-    """
-    Scales the weight of a GELU activation and a fully-connected layer proportionally.
-
-    Args:
-        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
-        fc (nn.Linear): The fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-
-    Raises:
-        TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
-        TypeError: If the `fc` module is not of type `nn.Linear`.
-    """
-    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
-    assert isinstance(fc, nn.Linear)
-
-    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
-
-    for p in fc.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-def apply_scale(module, scales_list, input_feat_dict=None):
-    """
-    Applies different scaling strategies to layers based on their type and hierarchy within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layers to be scaled.
-        scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
-            * prev_op_name (str): The name of the preceding operation or module,
-                relative to which the layers to be scaled are located.
-            * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
-            * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
-        input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
-            input features (optional).
-    """
-    for prev_op_name, layer_names, scales in scales_list:
-        prev_op = get_op_by_name(module, prev_op_name)
-        layers = [get_op_by_name(module, name) for name in layer_names]
-
-        prev_op.cuda()
-        for layer in layers:
-            layer.cuda()
-        scales.cuda()
-
-        if isinstance(prev_op, nn.Linear):
-            assert len(layers) == 1
-            scale_fc_fc(prev_op, layers[0], scales)
-        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
-            scale_ln_fcs(prev_op, layers, scales)
-        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
-            new_module = ScaledActivation(prev_op, scales)
-            set_op_by_name(module, prev_op_name, new_module)
-            scale_gelu_fc(prev_op, layers[0], scales)
-        else:
-            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
-
-        # apply the scaling to input feat if given; prepare it for clipping
-        if input_feat_dict is not None:
-            for layer_name in layer_names:
-                inp = input_feat_dict[layer_name]
-                inp.div_(scales.view(1, -1).to(inp.device))
-
-        prev_op.cpu()
-        for layer in layers:
-            layer.cpu()
-        scales.cpu()
-
-
-@torch.no_grad()
-def apply_clip(module, clip_list):
-    """
-    Applies element-wise clipping to the weight of a specific layer within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layer to be clipped.
-        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
-            * name (str): The name of the layer to be clipped, relative to the root of the module.
-            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
-    """
-    for name, max_val in clip_list:
-        layer = get_op_by_name(module, name)
-        layer.cuda()
-        max_val = max_val.to(layer.weight.device)
-        org_shape = layer.weight.shape
-        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
-        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
-        layer.weight.data = layer.weight.data.reshape(org_shape)
-        layer.cpu()
-
-
-def add_scale_weights(model_path, scale_path, tmp_path):
-    """
-    Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
-    including scaling factors and clipping bounds.
-
-    Args:
-        model_path (str): Path to the pre-trained model to be equipped with AWQ.
-        scale_path (str): Path to the AWQ scale factors (.pt file).
-        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
-    """
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, config=config, trust_remote_code=True
-    )
-    model.eval()
-    awq_results = torch.load(str(scale_path), map_location="cpu")
-    apply_scale(model, awq_results["scale"])
-    apply_clip(model, awq_results["clip"])
-    model.save_pretrained(str(tmp_path))
-    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt
deleted file mode 100644
index 991896116..000000000
--- a/awq-py/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-torch>=2.1.1
-transformers>=4.32.0

From 08c5ee87e4cceb603ecceac90734fcdade57311b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 28 Feb 2024 18:43:38 +0200
Subject: [PATCH 11/16] llama : remove deprecated API (#5770)

ggml-ci
---
 llama.cpp | 88 +------------------------------------------------------
 llama.h   | 45 ----------------------------
 2 files changed, 1 insertion(+), 132 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 893bcdbc0..30d5eb32d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7894,9 +7894,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;
 
     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -10062,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     }
 }
 
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
 void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
@@ -10192,38 +10188,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
@@ -11724,15 +11688,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }
 
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -12244,15 +12199,6 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12802,38 +12748,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
-int llama_eval(
-        struct llama_context * ctx,
-                 llama_token * tokens,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-            struct llama_context * ctx,
-                           float * embd,
-                         int32_t   n_tokens,
-                         int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
diff --git a/llama.h b/llama.h
index 16e28e91d..a6823bb2b 100644
--- a/llama.h
+++ b/llama.h
@@ -364,9 +364,6 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
@@ -423,14 +420,6 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads),
-            "use llama_model_apply_lora_from_file instead");
-
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
                       const char * path_lora,
@@ -606,27 +595,6 @@ extern "C" {
     // Decoding
     //
 
-    // Run the llama inference to obtain the logits and probabilities for the next token(s).
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval(
-            struct llama_context * ctx,
-                     llama_token * tokens,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
-    // Same as llama_eval, but use float matrix input directly.
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval_embd(
-            struct llama_context * ctx,
-                           float * embd,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
     // Return batch for single sequence of tokens starting at pos_0
     //
     // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@@ -800,13 +768,6 @@ extern "C" {
                              float * logits_guidance,
                              float   scale);
 
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-              struct llama_context * guidance_ctx,
-                             float   scale),
-              "use llama_sample_apply_guidance() instead");
-
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
             struct llama_context * ctx,
@@ -860,12 +821,6 @@ extern "C" {
           llama_token_data_array * candidates,
                            float   temp);
 
-    LLAMA_API DEPRECATED(void llama_sample_temperature(
-                struct llama_context * ctx,
-              llama_token_data_array * candidates,
-                               float   temp),
-            "use llama_sample_temp instead");
-
     /// @details Apply constraints from grammar
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,

From 317709b2a81dbaf87850202686ec5bb2602a504e Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 28 Feb 2024 19:33:37 +0000
Subject: [PATCH 12/16] make portability_enumeration_ext apple only (#5757)

---
 ggml-vulkan.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 314e3d7a9..896c290b2 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -1106,7 +1106,9 @@ void ggml_vk_instance_init() {
 
     const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
     const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+#ifdef __APPLE__
     const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+#endif
 
     std::vector<const char*> layers;
 
@@ -1117,13 +1119,17 @@ void ggml_vk_instance_init() {
     if (validation_ext) {
         extensions.push_back("VK_EXT_validation_features");
     }
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         extensions.push_back("VK_KHR_portability_enumeration");
     }
+#endif
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+#ifdef __APPLE__
     if (portability_enumeration_ext) {
         instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
     }
+#endif
 
     std::vector<vk::ValidationFeatureEnableEXT> features_enable;
     vk::ValidationFeaturesEXT validation_features;

From 87c91c07663b707e831c59ec373b5e665ff9d64a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 28 Feb 2024 21:44:21 +0200
Subject: [PATCH 13/16] ci : reduce 3b ppl chunks to 1 to avoid timeout (#5771)

ggml-ci
---
 ci/run.sh | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index f3a29c2e9..35eb3c7aa 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -272,19 +272,19 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
@@ -343,17 +343,17 @@ function gg_run_open_llama_3b_v2 {
     python3 ../convert-lora-to-ggml.py ${path_lora}
 
     # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
     compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
 
     # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
     compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
 
     # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
     compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
 
     set +e

From d5ab29757ebc59a30f03e408294ec20628a6374e Mon Sep 17 00:00:00 2001
From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Date: Thu, 29 Feb 2024 00:17:23 -0800
Subject: [PATCH 14/16] llama : constified `llama_set_state_data`'s `src`
 (#5774)

---
 llama.cpp | 6 +++---
 llama.h   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 30d5eb32d..62699ce52 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12545,8 +12545,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
-    uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * inp = src;
 
     // set rng
     {
@@ -12555,7 +12555,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+        std::string rng_str((const char *)inp, rng_size); inp += rng_size;
 
         std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
diff --git a/llama.h b/llama.h
index a6823bb2b..4d0ebe37d 100644
--- a/llama.h
+++ b/llama.h
@@ -575,7 +575,7 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(
             struct llama_context * ctx,
-                         uint8_t * src);
+                   const uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(

From 052051d8ae4639a1c3c61e7da3237bcc572469d4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 29 Feb 2024 21:42:11 +0100
Subject: [PATCH 15/16] Server: normalize naming (#5779)

* server: normalize naming

* fix spacing
---
 examples/server/server.cpp | 370 ++++++++++++++++---------------------
 examples/server/utils.hpp  | 186 ++++++++++++-------
 2 files changed, 277 insertions(+), 279 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 080fa9bd5..bf20e0cf1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -33,8 +33,7 @@
 
 using json = nlohmann::json;
 
-struct server_params
-{
+struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
@@ -49,103 +48,50 @@ struct server_params
 bool server_verbose = false;
 bool server_log_json = true;
 
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
-    {
-    }
-    return i;
-}
-
-enum stop_type
-{
+enum stop_type {
     STOP_FULL,
     STOP_PARTIAL,
 };
 
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
+// TODO: can become bool if we can't find use of more states
+enum slot_state {
+    IDLE,
+    PROCESSING,
+};
 
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (!text.empty() && !stop.empty())
-    {
-        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
-                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
-                    return text.size() - char_index - 1;
-                }
-            }
-        }
-    }
-    return std::string::npos;
-}
+enum slot_command {
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
 
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
-    std::string ret;
-    for (; begin != end; ++begin)
-    {
-        ret += llama_token_to_piece(ctx, *begin);
-    }
-    return ret;
-}
+struct slot_params {
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
-// format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
-    // if the size is 1 and first bit is 1, meaning it's a partial character
-    //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
-    {
-        std::stringstream ss;
-        ss << std::hex << (out[0] & 0xff);
-        std::string res(ss.str());
-        out = "byte: \\x" + res;
-    }
-    return out;
-}
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
 
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
-    json out = json::array();
-    for (const auto &prob : probs)
-    {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
-            });
-        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
-        });
-    }
-    return out;
-}
+    std::vector<std::string> antiprompt;
 
-struct llama_client_slot
-{
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image {
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+struct server_slot {
     int id;
     int task_id = -1;
 
@@ -165,8 +111,8 @@ struct llama_client_slot
     int32_t i_batch     = -1;
     int32_t n_predict   = -1;
 
-    int32_t num_prompt_tokens           = 0;
-    int32_t num_prompt_tokens_processed = 0;
+    int32_t n_prompt_tokens           = 0;
+    int32_t n_prompt_tokens_processed = 0;
 
     json prompt;
     std::string generated_text;
@@ -201,8 +147,8 @@ struct llama_client_slot
     std::vector<slot_image> images;
 
     // stats
-    size_t sent_count = 0;
-    size_t sent_token_probs_index = 0;
+    size_t n_sent_text = 0; // number of sent text character
+    size_t n_sent_token_probs = 0;
 
     int64_t t_start_process_prompt;
     int64_t t_start_genereration;
@@ -214,7 +160,7 @@ struct llama_client_slot
     int multitask_id = -1;
 
     void reset() {
-        num_prompt_tokens      = 0;
+        n_prompt_tokens        = 0;
         generated_text         = "";
         truncated              = false;
         stopped_eos            = false;
@@ -222,16 +168,15 @@ struct llama_client_slot
         stopped_limit          = false;
         stopping_word          = "";
         n_past                 = 0;
-        sent_count             = 0;
-        sent_token_probs_index = 0;
+        n_sent_text            = 0;
+        n_sent_token_probs     = 0;
         infill                 = false;
         ga_i                   = 0;
         n_past_se              = 0;
 
         generated_token_probs.clear();
 
-        for (slot_image & img : images)
-        {
+        for (slot_image & img : images) {
             free(img.image_embedding);
             if (img.img_data) {
                 clip_image_u8_free(img.img_data);
@@ -243,19 +188,15 @@ struct llama_client_slot
     }
 
     bool has_budget(gpt_params &global_params) {
-        if (params.n_predict == -1 && global_params.n_predict == -1)
-        {
+        if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
 
         n_remaining = -1;
 
-        if (params.n_predict != -1)
-        {
+        if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
-        }
-        else if (global_params.n_predict != -1)
-        {
+        } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
 
@@ -271,8 +212,7 @@ struct llama_client_slot
     }
 
     void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE)
-        {
+        if (command == RELEASE) {
             return;
         }
         cache_tokens.push_back(token.tok);
@@ -290,10 +230,10 @@ struct llama_client_slot
     json get_formated_timings() {
         return json
         {
-            {"prompt_n",               num_prompt_tokens_processed},
+            {"prompt_n",               n_prompt_tokens_processed},
             {"prompt_ms",              t_prompt_processing},
-            {"prompt_per_token_ms",    t_prompt_processing / num_prompt_tokens_processed},
-            {"prompt_per_second",      1e3 / t_prompt_processing * num_prompt_tokens_processed},
+            {"prompt_per_token_ms",    t_prompt_processing / n_prompt_tokens_processed},
+            {"prompt_per_second",      1e3 / t_prompt_processing * n_prompt_tokens_processed},
 
             {"predicted_n",            n_decoded},
             {"predicted_ms",           t_token_generation},
@@ -304,18 +244,18 @@ struct llama_client_slot
 
     void print_timings() const {
        char buffer[512];
-        double t_token = t_prompt_processing / num_prompt_tokens_processed;
-        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        double t_token = t_prompt_processing / n_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
         sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
-                t_prompt_processing, num_prompt_tokens_processed,
+                t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
         LOG_INFO(buffer, {
-            {"slot_id",                     id},
-            {"task_id",                     task_id},
-            {"t_prompt_processing",         t_prompt_processing},
-            {"num_prompt_tokens_processed", num_prompt_tokens_processed},
-            {"t_token",                     t_token},
-            {"n_tokens_second",             n_tokens_second},
+            {"slot_id",                   id},
+            {"task_id",                   task_id},
+            {"t_prompt_processing",       t_prompt_processing},
+            {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+            {"t_token",                   t_token},
+            {"n_tokens_second",           n_tokens_second},
         });
 
         t_token = t_token_generation / n_decoded;
@@ -343,7 +283,7 @@ struct llama_client_slot
     }
 };
 
-struct llama_metrics {
+struct server_metrics {
     uint64_t n_prompt_tokens_processed_total = 0;
     uint64_t n_tokens_predicted_total        = 0;
 
@@ -354,18 +294,16 @@ struct llama_metrics {
     uint64_t t_tokens_generation      = 0;
 
 
-    void on_prompt_eval(const llama_client_slot &slot) {
-        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
-
-        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
-        t_prompt_processing       += slot.t_prompt_processing;
+    void on_prompt_eval(const server_slot &slot) {
+        n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
+        n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
+        t_prompt_processing             += slot.t_prompt_processing;
     }
 
-    void on_prediction(const llama_client_slot &slot) {
+    void on_prediction(const server_slot &slot) {
         n_tokens_predicted_total += slot.n_decoded;
-
-        n_tokens_predicted  += slot.n_decoded;
-        t_tokens_generation += slot.t_token_generation;
+        n_tokens_predicted       += slot.n_decoded;
+        t_tokens_generation      += slot.t_token_generation;
     }
 
     void reset_bucket() {
@@ -404,13 +342,13 @@ struct llama_server_context
     std::string name_assistant;
 
     // slots / clients
-    std::vector<llama_client_slot> slots;
+    std::vector<server_slot> slots;
     json default_generation_settings_for_props;
 
-    llama_server_queue queue_tasks;
+    llama_server_queue    queue_tasks;
     llama_server_response queue_results;
 
-    llama_metrics metrics;
+    server_metrics metrics;
 
     ~llama_server_context()
     {
@@ -487,7 +425,7 @@ struct llama_server_context
         LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
-            llama_client_slot slot;
+            server_slot slot;
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
@@ -579,11 +517,11 @@ struct llama_server_context
         return prompt_tokens;
     }
 
-    llama_client_slot* get_slot(int id) {
+    server_slot* get_slot(int id) {
         int64_t t_last = ggml_time_us();
-        llama_client_slot *last_used = nullptr;
+        server_slot *last_used = nullptr;
 
-        for (llama_client_slot & slot : slots)
+        for (server_slot & slot : slots)
         {
             if (slot.id == id && slot.available())
             {
@@ -600,7 +538,7 @@ struct llama_server_context
         return last_used;
     }
 
-    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
+    bool launch_slot_with_data(server_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
 
@@ -888,7 +826,7 @@ struct llama_server_context
         clean_kv_cache = false;
     }
 
-    void update_system_prompt() {
+    void system_prompt_update() {
         kv_cache_clear();
         system_tokens.clear();
 
@@ -933,9 +871,9 @@ struct llama_server_context
         system_need_update = false;
     }
 
-    void notify_system_prompt_changed() {
+    void system_prompt_notify() {
         // release all slots
-        for (llama_client_slot &slot : slots)
+        for (server_slot &slot : slots)
         {
             slot.release();
         }
@@ -943,17 +881,17 @@ struct llama_server_context
         system_need_update = true;
     }
 
-    void process_system_prompt_data(const json &sys_props) {
+    void system_prompt_process(const json &sys_props) {
         system_prompt  = sys_props.value("prompt", "");
         name_user      = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
 
-        notify_system_prompt_changed();
+        system_prompt_notify();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
-                                        const stop_type type, llama_client_slot &slot)
+                                        const stop_type type, server_slot &slot)
     {
         size_t stop_pos = std::string::npos;
 
@@ -975,8 +913,8 @@ struct llama_server_context
             {
                 if (type == STOP_FULL)
                 {
-                    slot.stopped_word = true;
-                    slot.stopping_word = word;
+                    slot.stopped_word   = true;
+                    slot.stopping_word  = word;
                     slot.has_next_token = false;
                 }
                 stop_pos = pos;
@@ -986,7 +924,7 @@ struct llama_server_context
         return stop_pos;
     }
 
-    bool process_token(completion_token_output &result, llama_client_slot &slot) {
+    bool process_token(completion_token_output &result, server_slot &slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = llama_token_to_piece(ctx, result.tok);
         slot.sampled = result.tok;
@@ -1032,7 +970,7 @@ struct llama_server_context
 
         if (!incomplete)
         {
-            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+            size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
             const std::string str_test = slot.generated_text.substr(pos);
             bool is_stop_full = false;
             size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
@@ -1042,7 +980,7 @@ struct llama_server_context
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
-                pos = std::min(slot.sent_count, slot.generated_text.size());
+                pos = std::min(slot.n_sent_text, slot.generated_text.size());
             }
             else
             {
@@ -1055,7 +993,7 @@ struct llama_server_context
             {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.sent_count += result.text_to_send.size();
+                slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
             slot.add_token_string(result);
@@ -1099,7 +1037,7 @@ struct llama_server_context
         return slot.has_next_token; // continue
     }
 
-    bool process_images(llama_client_slot &slot) const
+    bool process_images(server_slot &slot) const
     {
         for (slot_image &img : slot.images)
         {
@@ -1132,7 +1070,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    json get_formated_generation(llama_client_slot &slot)
+    json get_formated_generation(server_slot &slot)
     {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
@@ -1179,7 +1117,7 @@ struct llama_server_context
         };
     }
 
-    void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
+    void send_partial_response(server_slot &slot, completion_token_output tkn)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1199,13 +1137,13 @@ struct llama_server_context
         {
             std::vector<completion_token_output> probs_output = {};
             const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
-            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
+            size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
+            size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
             if (probs_pos < probs_stop_pos)
             {
                 probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
             }
-            slot.sent_token_probs_index = probs_stop_pos;
+            slot.n_sent_token_probs = probs_stop_pos;
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }
 
@@ -1218,7 +1156,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void send_final_response(llama_client_slot &slot)
+    void send_final_response(server_slot &slot)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1233,7 +1171,7 @@ struct llama_server_context
             {"stop",                true},
             {"model",               params.model_alias},
             {"tokens_predicted",    slot.n_decoded},
-            {"tokens_evaluated",    slot.num_prompt_tokens},
+            {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
             {"prompt",              slot.prompt},
             {"truncated",           slot.truncated},
@@ -1271,7 +1209,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void send_embedding(llama_client_slot &slot)
+    void send_embedding(server_slot &slot)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1282,9 +1220,7 @@ struct llama_server_context
         const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
-            LOG_WARNING("embedding disabled", {
-                                                  {"params.embedding", params.embedding},
-                                              });
+            LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
             res.result_json = json
             {
                 {"embedding", std::vector<float>(n_embd, 0.0f)},
@@ -1296,7 +1232,7 @@ struct llama_server_context
             std::vector<float> embedding(data, data + n_embd);
             res.result_json = json
             {
-                {"embedding", embedding },
+                {"embedding", embedding},
             };
         }
         queue_results.send(res);
@@ -1345,7 +1281,7 @@ struct llama_server_context
     }
 
     // for multiple images processing
-    bool ingest_images(llama_client_slot &slot, int n_batch)
+    bool ingest_images(server_slot &slot, int n_batch)
     {
         int image_idx = 0;
 
@@ -1384,7 +1320,17 @@ struct llama_server_context
                 }
 
                 const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
+                llama_batch batch_img = {
+                    n_eval,
+                    nullptr,
+                    (img.image_embedding + i * n_embd),
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    slot.n_past,
+                    1, 0
+                };
                 if (llama_decode(ctx, batch_img))
                 {
                     LOG_TEE("%s : failed to eval image\n", __func__);
@@ -1454,7 +1400,7 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
+                server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
@@ -1469,10 +1415,10 @@ struct llama_server_context
                         send_error(task, "system prompt can only be updated when all slots are idle");
                         break;
                     }
-                    process_system_prompt_data(task.data["system_prompt"]);
+                    system_prompt_process(task.data["system_prompt"]);
 
                     // reset cache_tokens for all slots
-                    for (llama_client_slot &slot : slots)
+                    for (server_slot &slot : slots)
                     {
                         slot.cache_tokens.clear();
                         slot.n_past    = 0;
@@ -1512,20 +1458,20 @@ struct llama_server_context
                 int n_idle_slots       = 0;
                 int n_processing_slots = 0;
 
-                for (llama_client_slot &slot: slots) {
+                for (server_slot &slot: slots) {
                     json slot_data = get_formated_generation(slot);
                     slot_data["id"] = slot.id;
                     slot_data["task_id"] = slot.task_id;
                     slot_data["state"] = slot.state;
                     slot_data["prompt"] = slot.prompt;
                     slot_data["next_token"] = {
-                            {"has_next_token", slot.has_next_token},
-                            {"n_remain", slot.n_remaining},
+                            {"has_next_token",       slot.has_next_token},
+                            {"n_remain",             slot.n_remaining},
                             {"num_tokens_predicted", slot.n_decoded},
-                            {"stopped_eos", slot.stopped_eos},
-                            {"stopped_word", slot.stopped_word},
-                            {"stopped_limit", slot.stopped_limit},
-                            {"stopping_word", slot.stopping_word},
+                            {"stopped_eos",          slot.stopped_eos},
+                            {"stopped_word",         slot.stopped_word},
+                            {"stopped_limit",        slot.stopped_limit},
+                            {"stopping_word",        slot.stopping_word},
                     };
                     if (slot_data["state"] == IDLE) {
                         n_idle_slots++;
@@ -1563,10 +1509,10 @@ struct llama_server_context
                         { "n_tokens_predicted",              metrics.n_tokens_predicted},
                         { "t_tokens_generation",             metrics.t_tokens_generation},
 
-                        { "kv_cache_tokens_count",          llama_get_kv_cache_token_count(ctx)},
-                        { "kv_cache_used_cells",            llama_get_kv_cache_used_cells(ctx)},
+                        { "kv_cache_tokens_count",           llama_get_kv_cache_token_count(ctx)},
+                        { "kv_cache_used_cells",             llama_get_kv_cache_used_cells(ctx)},
 
-                        { "slots",                          slots_data },
+                        { "slots",                           slots_data },
                 };
                 metrics.reset_bucket();
                 queue_results.send(res);
@@ -1597,7 +1543,7 @@ struct llama_server_context
         if (system_need_update)
         {
             LOG_INFO("updating system prompt", {});
-            update_system_prompt();
+            system_prompt_update();
         }
 
         llama_batch_clear(batch);
@@ -1618,7 +1564,7 @@ struct llama_server_context
         task.target_id = -1;
         queue_tasks.post(task);
 
-        for (llama_client_slot &slot : slots)
+        for (server_slot &slot : slots)
         {
             if (slot.ga_n == 1)
             {
@@ -1754,45 +1700,50 @@ struct llama_server_context
                         prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
                     }
 
-                    slot.num_prompt_tokens = prompt_tokens.size();
+                    slot.n_prompt_tokens = prompt_tokens.size();
 
                     if (slot.params.n_keep < 0)
                     {
-                        slot.params.n_keep = slot.num_prompt_tokens;
+                        slot.params.n_keep = slot.n_prompt_tokens;
                     }
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
                     // if input prompt is too big, truncate it
-                    if (slot.num_prompt_tokens >= slot.n_ctx)
+                    if (slot.n_prompt_tokens >= slot.n_ctx)
                     {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
                         const int n_block_size = n_left / 2;
-                        const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+                        const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
-                        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+                        std::vector<llama_token> new_tokens(
+                            prompt_tokens.begin(),
+                            prompt_tokens.begin() + slot.params.n_keep);
+                        new_tokens.insert(
+                            new_tokens.end(),
+                            prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                            prompt_tokens.end());
 
                         LOG_VERBOSE("input truncated", {
-                            {"n_ctx",  slot.n_ctx},
-                            {"n_keep", slot.params.n_keep},
-                            {"n_left", n_left},
+                            {"n_ctx",      slot.n_ctx},
+                            {"n_keep",     slot.params.n_keep},
+                            {"n_left",     n_left},
                             {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                         });
                         slot.truncated = true;
                         prompt_tokens = new_tokens;
 
-                        slot.num_prompt_tokens = prompt_tokens.size();
-                        GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
+                        slot.n_prompt_tokens = prompt_tokens.size();
+                        GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }
 
                     if (!slot.params.cache_prompt)
                     {
                         llama_sampling_reset(slot.ctx_sampling);
 
-                        slot.n_past = 0;
+                        slot.n_past    = 0;
                         slot.n_past_se = 0;
-                        slot.ga_i = 0;
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                        slot.ga_i      = 0;
+                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
                     }
                     else
                     {
@@ -1811,7 +1762,7 @@ struct llama_server_context
                             slot.n_past -= 1;
                         }
 
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
+                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
 
                         if (slot.ga_n != 1)
                         {
@@ -1836,13 +1787,13 @@ struct llama_server_context
                             { "slot_id", slot.id },
                             { "task_id", slot.task_id },
                             { "n_past",  slot.n_past },
-                            { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
+                            { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
                         });
                     }
 
                     slot.cache_tokens = prompt_tokens;
 
-                    if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
+                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
                         LOG_INFO("we have to evaluate at least 1 token to generate logits", {
@@ -1898,8 +1849,8 @@ struct llama_server_context
                     if (has_images && !ingest_images(slot, n_batch))
                     {
                         LOG_ERROR("failed processing images", {
-                            "slot_id", slot.id,
-                            "task_id", slot.task_id,
+                            {"slot_id", slot.id},
+                            {"task_id", slot.task_id},
                         });
                         // FIXME @phymbert: to be properly tested
                         //  early returning without changing the slot state will block the slot for ever
@@ -2049,10 +2000,6 @@ struct llama_server_context
         LOG_VERBOSE("slots updated", {});
         return true;
     }
-
-    void run_on_all_tasks_finished() {
-        update_slots();
-    }
 };
 
 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2561,7 +2508,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 std::istreambuf_iterator<char>(),
                 std::back_inserter(systm_content)
             );
-            llama.process_system_prompt_data(json::parse(systm_content));
+            llama.system_prompt_process(json::parse(systm_content));
         }
         else if (arg == "-ctk" || arg == "--cache-type-k") {
             params.cache_type_k = argv[++i];
@@ -2692,7 +2639,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 /* llama.cpp completion api semantics */
 static json format_partial_response(
-    llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
+    llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
     json res = json
     {
@@ -2748,14 +2695,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
     });
 }
 
-struct token_translator
-{
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
 {
     auto & gtps = slot->generated_token_probs;
     auto translator = token_translator{llama.ctx};
@@ -3526,8 +3466,8 @@ int main(int argc, char **argv)
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
         &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_all_tasks_finished(std::bind(
-        &llama_server_context::run_on_all_tasks_finished, &llama));
+    llama.queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, &llama));
     llama.queue_results.on_multitask_update(std::bind(
         &llama_server_queue::update_multitask,
         &llama.queue_tasks,
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index d7abd7cbb..d98541f26 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -37,10 +37,6 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
-//
-// parallel
-//
-
 enum server_state {
     SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
     SERVER_STATE_READY,          // Server is ready and model is loaded
@@ -78,51 +74,8 @@ struct task_multi {
     std::vector<task_result> results{};
 };
 
-// TODO: can become bool if we can't find use of more states
-enum slot_state
-{
-    IDLE,
-    PROCESSING,
-};
-
-enum slot_command
-{
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
-};
-
-struct slot_params
-{
-    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
-    uint32_t seed      = -1; // RNG seed
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_predict = -1; // new tokens to predict
-
-    std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-};
-
-struct slot_image
-{
-    int32_t id;
-
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
-};
-
 // completion token output with probabilities
-struct completion_token_output
-{
+struct completion_token_output {
     struct token_prob
     {
         llama_token tok;
@@ -134,8 +87,13 @@ struct completion_token_output
     std::string text_to_send;
 };
 
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
-{
+struct token_translator {
+    llama_context * ctx;
+    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
+};
+
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
     std::stringstream ss_tid;
     ss_tid << std::this_thread::get_id();
     json log = nlohmann::ordered_json{
@@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line,
 //
 
 template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value)
-{
+static T json_value(const json &body, const std::string &key, const T &default_value) {
     // Fallback null to default value
     return body.contains(key) && !body.at(key).is_null()
         ? body.value(key, default_value)
@@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) {
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
-{
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
     size_t alloc_size = 0;
     // vector holding all allocated string to be passed to llama_chat_apply_template
     std::vector<std::string> str(messages.size() * 2);
@@ -250,7 +206,7 @@ struct llama_server_queue {
     // callback functions
     std::function<void(task_server&)> callback_new_task;
     std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_all_task_finished;
+    std::function<void(void)> callback_run_slots;
 
     // Add a new task to the end of the queue
     int post(task_server task) {
@@ -283,14 +239,14 @@ struct llama_server_queue {
         callback_new_task = callback;
     }
 
-    // Register function to process a multitask
+    // Register function to process a multitask when it is finished
     void on_finish_multitask(std::function<void(task_multi&)> callback) {
         callback_finish_multitask = callback;
     }
 
-    // Register the function to be called when the batch of tasks is finished
-    void on_all_tasks_finished(std::function<void(void)> callback) {
-        callback_all_task_finished = callback;
+    // Register the function to be called when all slots data is ready to be processed
+    void on_run_slots(std::function<void(void)> callback) {
+        callback_run_slots = callback;
     }
 
     // Call when the state of one slot is changed
@@ -312,7 +268,13 @@ struct llama_server_queue {
         condition_tasks.notify_all();
     }
 
-    // Start the main loop.
+    /**
+     * Main loop consists of these steps:
+     * - Wait until a new task arrives
+     * - Process the task (i.e. maybe copy data into slot)
+     * - Check if multitask is finished
+     * - Run all slots
+     */
     void start_loop() {
         running = true;
         while (true) {
@@ -331,8 +293,8 @@ struct llama_server_queue {
                     LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                     callback_new_task(task);
                 }
-                LOG_VERBOSE("callback_all_task_finished", {});
-                // process and update all the multitasks
+                LOG_VERBOSE("update_multitasks", {});
+                // check if we have any finished multitasks
                 auto queue_iterator = queue_multitasks.begin();
                 while (queue_iterator != queue_multitasks.end())
                 {
@@ -349,8 +311,9 @@ struct llama_server_queue {
                         ++queue_iterator;
                     }
                 }
-                // all tasks in the current loop is finished
-                callback_all_task_finished();
+                // all tasks in the current loop is processed, slots data is now ready
+                LOG_VERBOSE("callback_run_slots", {});
+                callback_run_slots();
             }
             LOG_VERBOSE("wait for new task", {});
             // wait for new task
@@ -408,12 +371,14 @@ struct llama_server_response {
     std::mutex mutex_results;
     std::condition_variable condition_results;
 
+    // add the task_id to the list of tasks waiting for response
     void add_waiting_task_id(int task_id) {
         LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.insert(task_id);
     }
 
+    // when the request is finished, we can remove task associated with it
     void remove_waiting_task_id(int task_id) {
         LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
@@ -574,3 +539,96 @@ static std::string gen_chatcmplid()
     chatcmplid << "chatcmpl-" << random_string();
     return chatcmplid.str();
 }
+
+//
+// other common utils
+//
+
+static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
+{
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+    {
+    }
+    return i;
+}
+
+static bool ends_with(const std::string &str, const std::string &suffix)
+{
+    return str.size() >= suffix.size() &&
+           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
+}
+
+static size_t find_partial_stop_string(const std::string &stop,
+                                       const std::string &text)
+{
+    if (!text.empty() && !stop.empty())
+    {
+        const char text_last_char = text.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
+        {
+            if (stop[char_index] == text_last_char)
+            {
+                const std::string current_partial = stop.substr(0, char_index + 1);
+                if (ends_with(text, current_partial))
+                {
+                    return text.size() - char_index - 1;
+                }
+            }
+        }
+    }
+    return std::string::npos;
+}
+
+// TODO: reuse llama_detokenize
+template <class Iter>
+static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+{
+    std::string ret;
+    for (; begin != end; ++begin)
+    {
+        ret += llama_token_to_piece(ctx, *begin);
+    }
+    return ret;
+}
+
+// format incomplete utf-8 multibyte character for output
+static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
+{
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    //   (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
+    {
+        std::stringstream ss;
+        ss << std::hex << (out[0] & 0xff);
+        std::string res(ss.str());
+        out = "byte: \\x" + res;
+    }
+    return out;
+}
+
+// convert a vector of completion_token_output to json
+static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
+{
+    json out = json::array();
+    for (const auto &prob : probs)
+    {
+        json probs_for_token = json::array();
+        for (const auto &p : prob.probs)
+        {
+            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
+            probs_for_token.push_back(json
+            {
+                {"tok_str", tok_str},
+                {"prob",    p.prob},
+            });
+        }
+        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
+        out.push_back(json{
+            {"content", tok_str},
+            {"probs",   probs_for_token},
+        });
+    }
+    return out;
+}

From 38d152160898b0173ffe4dc7df5daadcbd2eceb0 Mon Sep 17 00:00:00 2001
From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
Date: Fri, 1 Mar 2024 07:36:47 +0000
Subject: [PATCH 16/16] [SYCL] Use batched mul_mat pathway (#5591)

* Use batched mul_mat pathway

* rm extra line

* Explicitly state scaled data type

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
---
 ggml-sycl.cpp | 107 +++++++++++++++++++++-----------------------------
 1 file changed, 44 insertions(+), 63 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index a054ec8b9..6f391b0c6 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12726,6 +12726,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
@@ -13269,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
     int64_t i03 = i13 / r3;
     int64_t i02 = i12 / r2;
 
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
 }
 
-static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
-                                                 const ggml_tensor *src1,
-                                                 ggml_tensor *dst) try {
+static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
+                                             const ggml_tensor *src1,
+                                             ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
 
     GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-
-    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
+    const int64_t ne_dst  = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13312,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     // convert src1 to fp16
-    const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
-    GGML_ASSERT(to_fp16_sycl != nullptr);
-
-    sycl_pool_alloc<sycl::half> src1_as_f16(ne1);
-    to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+    sycl_pool_alloc<sycl::half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+      const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+      const int64_t ne_src1 = ggml_nelements(src1);
+      src1_f16_alloc.alloc(ne_src1);
+      GGML_ASSERT(to_fp16_sycl != nullptr);
+      to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
+                                                       : src1_f16_alloc.get();
 
     sycl_pool_alloc<sycl::half> dst_f16;
     char * dst_t;
@@ -13337,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     const void * alpha = &alpha_f16;
     const void * beta  = &beta_f16;
 
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne);
+    // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
+    // once oneMKL open source supports half, half, float, float: datatypes
+    dst_t = (char *) dst_f16.alloc(ne_dst);
 
-        nbd2 /= sizeof(float) / sizeof(sycl::half);
-        nbd3 /= sizeof(float) / sizeof(sycl::half);
-    } else {
-        dst_t = (char *) dst_ddf;
-
-        cu_compute_type = dpct::library_data_t::real_float;
-        cu_data_type = dpct::library_data_t::real_float;
-
-        alpha = &alpha_f32;
-        beta  = &beta_f32;
-    }
+    nbd2 /= sizeof(float) / sizeof(sycl::half);
+    nbd3 /= sizeof(float) / sizeof(sycl::half);
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -13386,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
             *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
-            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
-            (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
-            nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
-            (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
+            nb01 / nb00, nb02 / nb00,
+            (const char *)src1_f16, dpct::library_data_t::real_half,
+            nb11 / nb10, nb12 / nb10, beta,
+            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
     } else {
         // use syclGemmBatchedEx
@@ -13409,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
                                          {sycl::aspect::fp16});
 
             main_stream->submit([&](sycl::handler &cgh) {
-                const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
-                const void **ptrs_src_get_ct3 = ptrs_src.get();
-                void **ptrs_dst_get_ct4 = ptrs_dst.get();
-
+                const void **ptrs_src_get = ptrs_src.get();
+                void **ptrs_dst_get = ptrs_dst.get();
+                size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+                size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
                 cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
                                  [=](sycl::nd_item<3> item_ct1) {
                                      k_compute_batched_ptrs(
-                                         src0_as_f16, src1_as_f16_get_ct1,
-                                         dst_t, ptrs_src_get_ct3,
-                                         ptrs_dst_get_ct4, ne12, ne13, ne23,
-                                         nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
-                                         r3, item_ct1);
+                                         src0_as_f16, src1_f16,
+                                         dst_t, ptrs_src_get,
+                                         ptrs_dst_get, ne12, ne13, ne23,
+                                         nb02, nb03, nb12_scaled, nb13_scaled,
+                                         nbd2, nbd3, r2, r3, item_ct1);
                                  });
             });
         }
-        /*
-        DPCT1010:95: SYCL uses exceptions to report errors and does not use the
-        error codes. The call was replaced with 0. You need to rewrite this
-        code.
-        */
-        SYCL_CHECK(0);
-
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const void **)(ptrs_src.get() + 0 * ne23),
-            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
+            dpct::library_data_t::real_half, nb01 / nb00,
             (const void **)(ptrs_src.get() + 1 * ne23),
-            dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
+            dpct::library_data_t::real_half, nb11 / nb10, beta,
             (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
             cu_compute_type)));
     }
 #endif
 
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
-        to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
-    }
+    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+    to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13491,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         // KQV single-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
         ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
-        ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
+        ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);