From 7c4263d4261d6ee6f0539d53eb9e1b4d120ba8af Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 28 Feb 2024 10:37:02 +0200 Subject: [PATCH 01/16] ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760) * WIP: make i-quants work for QK_K = 64 * iq2_xs: attempt to fix AVX dot product for QK_K = 64 Tests pass, but I get gibberish. * QK_K = 64 tests pass on ARM_NEON and Metal Sadly, that does not mean it actually works. * Make CUDA compile with QK_K = 64 Tests don't pass, plus we get misaligned access * Q2_K: fixed bug in imatrix quantization for QK_K = 64 * iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work) --------- Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 27 ++++++--- ggml-metal.metal | 58 ++++++++++--------- ggml-quants.c | 148 +++++++++++++++++++++++++++++++++++++++-------- ggml-quants.h | 5 ++ ggml.c | 15 ++++- 5 files changed, 194 insertions(+), 59 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dfd28df62..831c84efb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -544,14 +544,19 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong #define QR3_XS 8 #define QI3_XS (QK_K / (4*QR3_XS)) +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif typedef struct { half d; uint8_t qs[QK_K/4]; uint8_t qh[QK_K/32]; uint8_t signs[QK_K/8]; - uint8_t scales[QK_K/64]; + uint8_t scales[IQ3S_N_SCALE]; } block_iq3_s; -static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding"); +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); #define QR1_S 8 #define QI1_S (QK_K / (4*QR1_S)) @@ -571,6 +576,11 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); +#if QK_K == 64 +#define block_iq4_xs block_iq4_nl +#define QR4_XS QR4_NL +#define QI4_XS QI4_NL +#else // QR4_XS = 8 is very slightly faster than QR4_XS = 4 #define QR4_XS 8 #define QI4_XS (QK_K / (4*QR4_XS)) @@ -581,7 +591,7 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); - +#endif #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -2439,9 +2449,9 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst } +#if QK_K != 64 template static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; const block_iq4_xs * x = (const block_iq4_xs *)vx; @@ -2455,8 +2465,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } - } +#endif static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { @@ -5382,8 +5392,7 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( return 0.f; #endif #else - assert(false); - return 0.f; + return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); #endif } @@ -7444,7 +7453,11 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, template static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_block_iq4_nl<<>>(vx, y); +#else dequantize_block_iq4_xs<<>>(vx, y); +#endif } template diff --git a/ggml-metal.metal b/ggml-metal.metal index 689411903..74a5e0b03 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2560,12 +2560,16 @@ typedef struct { uint8_t qs[QK4_NL/2]; } block_iq4_nl; +#if QK_K == 64 +#define block_iq4_xs block_iq4_nl +#else typedef struct { half d; uint16_t scales_h; uint8_t scales_l[QK_K/64]; uint8_t qs[QK_K/2]; } block_iq4_xs; +#endif //====================================== dot products ========================= @@ -4346,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); } -#if QK_K == 256 const int ix = tiisg; device const float * y4 = y + 32 * ix; @@ -4387,12 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl( y4 += 32 * 32; } -#else - (void) x; - (void) y; - (void) yl; - (void) nb32; -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -4482,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); } -#if QK_K == 256 const int ix = tiisg; device const float * y4 = y + 32 * ix; @@ -4533,12 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl( y4 += 32 * 32; } -#else - (void) x; - (void) y; - (void) yl; - (void) nb32; -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -4628,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); } -#if QK_K == 256 const int ix = tiisg; device const float * y4 = y + 32 * ix; @@ -4672,12 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl( y4 += 32 * 32; } -#else - (void) x; - (void) y; - (void) yl; - (void) nb32; -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -5016,7 +4999,6 @@ void kernel_mul_mv_iq1_s_f32_impl( const int nb32 = nb * (QK_K / 32); -#if QK_K == 256 const int ix = tiisg/2; const int il = tiisg%2; @@ -5055,12 +5037,6 @@ void kernel_mul_mv_iq1_s_f32_impl( y4 += 16 * 32; } -#else - (void) x; - (void) y; - (void) yl; - (void) nb32; -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -5167,6 +5143,7 @@ void kernel_mul_mv_iq4_nl_f32_impl( } } +#if QK_K != 64 void kernel_mul_mv_iq4_xs_f32_impl( device const void * src0, device const float * src1, @@ -5260,6 +5237,7 @@ void kernel_mul_mv_iq4_xs_f32_impl( } } } +#endif [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( @@ -5344,7 +5322,11 @@ kernel void kernel_mul_mv_iq4_xs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { +#if QK_K == 64 + kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +#else kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +#endif } //============================= templates and their specializations ============================= @@ -5770,6 +5752,9 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 template void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { +#if QK_K == 64 + dequantize_iq4_nl(xb, il, reg); +#else // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 const int ib32 = il/2; il = il%2; @@ -5786,6 +5771,7 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; } +#endif } template @@ -6334,7 +6320,11 @@ template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; +#if QK_K == 64 +template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; +#else template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; +#endif // // matrix-matrix multiplication @@ -6378,7 +6368,11 @@ template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; +#if QK_K == 64 +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +#else template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +#endif // // indirect matrix-matrix multiplication @@ -6434,7 +6428,11 @@ template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#if QK_K == 64 +template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#else template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#endif // // matrix-vector multiplication @@ -7707,7 +7705,11 @@ kernel void kernel_mul_mv_id_iq4_xs_f32( const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; +#if QK_K == 64 + kernel_mul_mv_iq4_nl_f32_impl( +#else kernel_mul_mv_iq4_xs_f32_impl( +#endif src0[id], (device const float *) (src1 + bid*nb11), dst + bid*ne0, diff --git a/ggml-quants.c b/ggml-quants.c index f73d17ce2..371826f14 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1877,7 +1877,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri float mins[QK_K/16]; float scales[QK_K/16]; float sw[QK_K/16]; - float weight[QK_K/16]; + float weight[16]; uint8_t Ls[QK_K/16], Lm[QK_K/16]; for (int i = 0; i < nb; i++) { @@ -1887,13 +1887,42 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri float sigma2 = sumx2/QK_K; for (int j = 0; j < QK_K/16; ++j) { const float * restrict qw = quant_weights + QK_K * i + 16*j; - for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); + for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l]; - scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); } - float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); - float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); + float dm, mm; +#if QK_K == 64 + float max_scale = 0, max_min = 0; + for (int j = 0; j < QK_K/16; ++j) { + max_scale = MAX(max_scale, scales[j]); + max_min = MAX(max_min, mins[j]); + } + dm = max_scale/15; + mm = max_min/15; + if (max_scale) { + float id = 1/dm; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(id*scales[j]); + Ls[j] = MAX(0, MIN(15, l)); + } + } else { + memset(Ls, 0, QK_K/16); + } + if (max_min) { + float id = 1/mm; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(id*mins[j]); + Lm[j] = MAX(0, MIN(15, l)); + } + } else { + memset(Lm, 0, QK_K/16); + } +#else + dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); + mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); +#endif y[i].d = GGML_FP32_TO_FP16(dm); y[i].dmin = GGML_FP32_TO_FP16(mm); dm = GGML_FP16_TO_FP32(y[i].d); @@ -4227,6 +4256,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); +#if QK_K == 64 + dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k); +#else const int nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -4246,6 +4278,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, qs += 16; } } +#endif } //===================================== Q8_K ============================================== @@ -6306,7 +6339,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; - int isum[4]; + int isum[QK_K/16]; for (int i = 0; i < nb; ++i) { @@ -6322,14 +6355,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - isum[0] = isum[1] = isum[2] = isum[3] = 0; + memset(isum, 0, (QK_K/16)*sizeof(int)); for (int l = 0; l < 16; ++l) { isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3); isum[1] += q8[l+16] * ((q2[l] >> 2) & 3); isum[2] += q8[l+32] * ((q2[l] >> 4) & 3); isum[3] += q8[l+48] * ((q2[l] >> 6) & 3); } - for (int l = 0; l < 4; ++l) { + for (int l = 0; l < QK_K/16; ++l) { isum[l] *= (sc[l] & 0xF); } sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs; @@ -9488,15 +9521,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * #elif defined(__AVX2__) - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - const __m256i m511 = _mm256_set1_epi16(511); const __m256i mone = _mm256_set1_epi8(1); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; static const char block_sign_shuffle_mask_1[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, @@ -9510,11 +9535,77 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; - const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); +#if QK_K == 64 + static const uint8_t k_bit_helper[16] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper); + const __m128i m511 = _mm_set1_epi16(511); + typedef union { + __m128i vec_index; + uint16_t index[8]; + } index_t; + + index_t idx; + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs); + idx.vec_index = _mm_and_si128(q2_data, m511); + + const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9); + const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13); + const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper); + + const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); + const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits); + const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits); + + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs); + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32)); + + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]], + iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]], + iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]); + + __m256i signs; + signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + + const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1)); + const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1)); + + const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2)); + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); +#else + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); + const __m256i m511 = _mm256_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + uint64_t aux64; // somewhat hacky, but gives a significant boost in performance @@ -9603,6 +9694,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = 0.125f * hsum_float_8(accumf); +#endif #else @@ -10199,7 +10291,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const const int nb = n / QK_K; -#if defined __ARM_NEON + // TODO: implement for QK_K = 64 +#if defined __ARM_NEON && QK_K == 256 const uint8x16_t m8 = vdupq_n_u8(0x08); const uint8x16_t m7 = vdupq_n_u8(0x07); @@ -10256,7 +10349,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const *s = sumf; -#elif defined __AVX2__ + // TODO: implement for QK_K = 64 +#elif defined __AVX2__ && QK_K == 256 const __m128i m8 = _mm_set1_epi8(0x08); const __m128i m7 = _mm_set1_epi8(0x07); @@ -10455,6 +10549,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); assert(n % QK_K == 0); +#if QK_K == 64 + ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc); +#else const block_iq4_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -10574,6 +10671,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = sumf; #endif +#endif } // ================================ IQ2 quantization ============================================= @@ -10921,7 +11019,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_xxs * y = vy; @@ -11094,7 +11192,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_xs * y = vy; @@ -12037,7 +12135,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(n%QK_K == 0); - const int nbl = n/256; + const int nbl = n/QK_K; block_iq1_s * y = vy; @@ -12315,6 +12413,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest } size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +#if QK_K == 64 + return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights); +#else (void)hist; GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; @@ -12333,6 +12434,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i qrow += nblock*sizeof(block_iq4_xs); } return nrow * nblock * sizeof(block_iq4_xs); +#endif } void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { @@ -12363,7 +12465,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_s * y = vy; diff --git a/ggml-quants.h b/ggml-quants.h index 2c61134c4..316e35687 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -230,6 +230,10 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); +#if QK_K == 64 +#define block_iq4_xs block_iq4_nl +//typedef struct block_iq4_nl block_iq4_xs; +#else typedef struct { ggml_fp16_t d; uint16_t scales_h; @@ -237,6 +241,7 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +#endif #ifdef __cplusplus extern "C" { diff --git a/ggml.c b/ggml.c index d66db3352..4591644ad 100644 --- a/ggml.c +++ b/ggml.c @@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", +#if QK_K == 64 + .blck_size = QK4_NL, +#else .blck_size = QK_K, +#endif .type_size = sizeof(block_iq4_xs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float = quantize_row_iq4_xs, .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference, .vec_dot = ggml_vec_dot_iq4_xs_q8_K, +#if QK_K == 64 + .vec_dot_type = GGML_TYPE_Q8_0, +#else .vec_dot_type = GGML_TYPE_Q8_K, +#endif .nrows = 1, }, [GGML_TYPE_Q8_K] = { @@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_IQ4_NL: +#if QK_K == 64 + case GGML_TYPE_IQ4_XS: +#endif { GGML_ASSERT(start % QK4_NL == 0); GGML_ASSERT(start % n_per_row == 0); @@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; +#if QK_K != 64 case GGML_TYPE_IQ4_XS: { - GGML_ASSERT(start % QK4_NL == 0); + GGML_ASSERT(start % QK_K == 0); GGML_ASSERT(start % n_per_row == 0); size_t start_row = start / n_per_row; size_t row_size = ggml_row_size(type, n_per_row); result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; +#endif case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); From efc72253f7987ed7bdc8bde9d9fa5c7cac2f6292 Mon Sep 17 00:00:00 2001 From: Jorge A <161275481+jorgealias@users.noreply.github.com> Date: Wed, 28 Feb 2024 01:39:15 -0700 Subject: [PATCH 02/16] server : add "/chat/completions" alias for "/v1/...` (#5722) * Add "/chat/completions" as alias for "/v1/chat/completions" * merge to upstream master * minor : fix trailing whitespace --------- Co-authored-by: Georgi Gerganov --- examples/server/server.cpp | 133 +++++++++--------- .../server/tests/features/parallel.feature | 22 +++ examples/server/tests/features/steps/steps.py | 28 +++- 3 files changed, 115 insertions(+), 68 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 846ef7e5f..6b3ee531c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3211,87 +3211,88 @@ int main(int argc, char **argv) res.set_content(models.dump(), "application/json; charset=utf-8"); }); + const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res) + { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + if (!validate_api_key(req, res)) { + return; + } + json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template); - // TODO: add mount point without "/v1" prefix -- how? - svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res) - { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!validate_api_key(req, res)) { - return; - } - json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template); + const int task_id = llama.queue_tasks.get_new_id(); + llama.queue_results.add_waiting_task_id(task_id); + llama.request_completion(task_id, data, false, false, -1); - const int task_id = llama.queue_tasks.get_new_id(); - llama.queue_results.add_waiting_task_id(task_id); - llama.request_completion(task_id, data, false, false, -1); + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.queue_results.recv(task_id); - if (!json_value(data, "stream", false)) { - std::string completion_text; - task_result result = llama.queue_results.recv(task_id); + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); - if (!result.error && result.stop) { - json oaicompat_result = format_final_response_oaicompat(data, result); + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json; charset=utf-8"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); + } + llama.queue_results.remove_waiting_task_id(task_id); + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.queue_results.recv(task_id); + if (!llama_result.error) { + std::vector result_array = format_partial_response_oaicompat( llama_result); - res.set_content(oaicompat_result.dump(-1, ' ', false, - json::error_handler_t::replace), - "application/json; charset=utf-8"); - } else { - res.status = 500; - res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); - } - llama.queue_results.remove_waiting_task_id(task_id); - } else { - const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { - while (true) { - task_result llama_result = llama.queue_results.recv(task_id); - if (!llama_result.error) { - std::vector result_array = format_partial_response_oaicompat( llama_result); - - for (auto it = result_array.begin(); it != result_array.end(); ++it) - { - if (!it->empty()) { - const std::string str = - "data: " + - it->dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - LOG_VERBOSE("data stream", {{"to_send", str}}); - if (!sink.write(str.c_str(), str.size())) { - llama.queue_results.remove_waiting_task_id(task_id); - return false; - } - } - } - if (llama_result.stop) { - break; - } - } else { + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { const std::string str = - "error: " + - llama_result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace) + + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n"; LOG_VERBOSE("data stream", {{"to_send", str}}); if (!sink.write(str.c_str(), str.size())) { llama.queue_results.remove_waiting_task_id(task_id); return false; } - break; } } - sink.done(); - llama.queue_results.remove_waiting_task_id(task_id); - return true; - }; - - auto on_complete = [task_id, &llama](bool) { - // cancel request - llama.request_cancel(task_id); - llama.queue_results.remove_waiting_task_id(task_id); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + llama.queue_results.remove_waiting_task_id(task_id); + return false; + } + break; + } } - }); + sink.done(); + llama.queue_results.remove_waiting_task_id(task_id); + return true; + }; + + auto on_complete = [task_id, &llama](bool) { + // cancel request + llama.request_cancel(task_id); + llama.queue_results.remove_waiting_task_id(task_id); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }; + + svr.Post("/chat/completions", chat_completions); + svr.Post("/v1/chat/completions", chat_completions); svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index c85f9de1d..5f895cf90 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -54,6 +54,28 @@ Feature: Parallel | disabled | 128 | | enabled | 64 | + Scenario Outline: Multi users OAI completions compatibility no v1 + Given a system prompt You are a writer. + And a model tinyllama-2 + Given a prompt: + """ + Write a very long book. + """ + And a prompt: + """ + Write another a poem. + """ + And max tokens to predict + And streaming is + Given concurrent OAI completions requests no v1 + Then the server is busy + Then the server is idle + Then all prompts are predicted with tokens + Examples: + | streaming | n_predict | + | disabled | 128 | + | enabled | 64 | + Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 Given a prompt: """ diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index ad87fcb82..381da105e 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -231,6 +231,7 @@ async def step_oai_chat_completions(context, api_error): completion = await oai_chat_completions(context.prompts.pop(), context.system_prompt, context.base_url, + '/v1/chat', False, model=context.model if hasattr(context, 'model') else None, @@ -288,6 +289,28 @@ async def step_oai_chat_completions(context): # user_prompt is inserted automatically context.system_prompt, context.base_url, + '/v1/chat/completions', + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) + + +@step(u'concurrent OAI completions requests no v1') +@async_run_until_complete +async def step_oai_chat_completions(context): + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + '/chat/completions', True, # async_client model=context.model if hasattr(context, 'model') else None, @@ -497,6 +520,7 @@ async def request_completion(prompt, async def oai_chat_completions(user_prompt, system_prompt, base_url, + base_path, async_client, debug=False, model=None, @@ -537,7 +561,7 @@ async def oai_chat_completions(user_prompt, origin = 'llama.cpp' headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} async with aiohttp.ClientSession() as session: - async with session.post(f'{base_url}/v1/chat/completions', + async with session.post(f'{base_url}{base_path}', json=payload, headers=headers) as response: if enable_streaming: @@ -579,7 +603,7 @@ async def oai_chat_completions(user_prompt, else: try: openai.api_key = user_api_key - openai.api_base = f'{base_url}/v1/chat' + openai.api_base = f'{base_url}{base_path}' chat_completion = openai.Completion.create( messages=payload['messages'], model=model, From 6c4416868df2e5455da7d20547f62bcf9735ba8e Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 28 Feb 2024 09:39:39 +0100 Subject: [PATCH 03/16] readme : add link to LLaVA 1.6 models (#5758) Signed-off-by: Daniel Bevenius --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 507a2888b..5401e197f 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ Typically finetunes of the base models below are supported as well. **Multimodal models:** -- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) +- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) From 177628bfd85565070916ad66a5ac4071ee0527d8 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Wed, 28 Feb 2024 02:51:11 -0600 Subject: [PATCH 04/16] llama : improve BERT tokenization (#5740) * implement nfd for stripping accents in wpm tokenizer * sort nfd map; reuse iterator * use builtin tolower * add locale include * Simplify to_lower cases Co-authored-by: Jared Van Bortel --------- Co-authored-by: Jared Van Bortel --- llama.cpp | 137 +++++++++------------------- unicode.h | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 305 insertions(+), 94 deletions(-) diff --git a/llama.cpp b/llama.cpp index 464e1b89b..356ca1076 100644 --- a/llama.cpp +++ b/llama.cpp @@ -68,10 +68,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -8941,37 +8943,46 @@ struct llm_tokenizer_wpm { } std::vector preprocess(const std::string & text) { - std::string ori_str = normalize(text); - uint64_t ori_size = ori_str.size(); + // normalalization form D + std::vector codepoints = codepoints_from_utf8(text); + std::vector nfd_codepoints; + for (uint32_t code : codepoints) { + auto it = nfd_map.find(code); + if (it != nfd_map.end()) { + for (uint32_t c : it->second) { + nfd_codepoints.push_back(c); + } + } else { + nfd_codepoints.push_back(code); + } + } - // single punct / single symbol / single digit - // baseline: add whitespace on the left and right of punct and chinese characters - std::vector words; + // strip accents, strip control, uniformize whitespace, + // to lowercase, pad chinese characters, pad punctuation std::string new_str = ""; - uint64_t i = 0; - while (i < ori_size) { - int utf_char_len = utf8_len(ori_str[i]); - if ((utf_char_len == 1) && ispunct(ori_str[i])) { - new_str += " "; - new_str += ori_str[i]; - new_str += " "; - i += 1; + for (uint32_t code : nfd_codepoints) { + int type = codepoint_type(code); + if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) { + continue; } - else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) { - new_str += " "; - new_str += ori_str.substr(i, 3); - new_str += " "; - i += 3; + code = to_lower(code); + if (type == CODEPOINT_TYPE_WHITESPACE) { + code = ' '; } - else { - new_str += ori_str[i]; - i += 1; + std::string s = codepoint_to_utf8(code); + if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) { + new_str += " "; + new_str += s; + new_str += " "; + } else { + new_str += s; } } // split by whitespace uint64_t l = 0; uint64_t r = 0; + std::vector words; while (r < new_str.size()) { // if is whitespace if (isspace(new_str[r])) { @@ -8989,47 +9000,20 @@ struct llm_tokenizer_wpm { return words; } - std::string normalize(const std::string & text) { - // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 - std::string text2 = strip_accents(text); - for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) { - char c = text2[i]; - if (c >= 'A' && c <= 'Z') { - text2[i] = c - 'A' + 'a'; - } + uint32_t to_lower(uint32_t code) { +#if defined(_WIN32) + if (code > 0xFFFF) { + return code; } - return text2; +#endif + return std::tolower(wchar_t(code), std::locale("en_US.UTF-8")); } - bool is_chinese_char(const std::string & str) { - int len = str.length(); - unsigned int codepoint = 0; - int num_bytes = 0; - int i = 0; - unsigned char ch = static_cast(str[i]); - if (ch <= 0x7f) { - codepoint = ch; - num_bytes = 1; - } else if ((ch >> 5) == 0x06) { - codepoint = ch & 0x1f; - num_bytes = 2; - } else if ((ch >> 4) == 0x0e) { - codepoint = ch & 0x0f; - num_bytes = 3; - } else if ((ch >> 3) == 0x1e) { - codepoint = ch & 0x07; - num_bytes = 4; - } - for (int j = 1; j < num_bytes; ++j) { - if (i + j >= len) { - return false; // incomplete UTF-8 character - } - unsigned char next_ch = static_cast(str[i + j]); - if ((next_ch >> 6) != 0x02) { - return false; // invalid trailing byte - } - codepoint = (codepoint << 6) | (next_ch & 0x3f); - } + bool is_ascii_punct(uint32_t code) { + return code < 256 && ispunct(code); + } + + bool is_chinese_char(uint32_t codepoint) { if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) || (codepoint >= 0x3400 && codepoint <= 0x4DBF) || (codepoint >= 0x20000 && codepoint <= 0x2A6DF) || @@ -9045,41 +9029,6 @@ struct llm_tokenizer_wpm { return false; } - std::string strip_accents(const std::string & input_string) { - std::string resultString; - std::map accent_map = { - {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'}, - {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'}, - {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'}, - {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'}, - {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'}, - {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'}, - {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'}, - {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'}, - {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'}, - }; - - for (size_t i = 0; i < input_string.length();) { - int len = utf8_len(input_string[i]); - std::string curChar = input_string.substr(i, len); - auto iter = accent_map.find(curChar); - if (iter != accent_map.end()) { - resultString += iter->second; - } else { - resultString += curChar; - } - i += len; - } - - return resultString; - } - - static size_t utf8_len(char src) { - const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; - } - const llama_vocab & vocab; }; diff --git a/unicode.h b/unicode.h index 10a5dab01..620e2b580 100644 --- a/unicode.h +++ b/unicode.h @@ -223,6 +223,268 @@ static const std::vector> control_ranges = { {0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, }; +static const std::unordered_map> nfd_map = { +{0xC0, {0x41, 0x300}}, {0xC1, {0x41, 0x301}}, {0xC2, {0x41, 0x302}}, {0xC3, {0x41, 0x303}}, {0xC4, {0x41, 0x308}}, {0xC5, {0x41, 0x30A}}, {0xC7, {0x43, 0x327}}, {0xC8, {0x45, 0x300}}, +{0xC9, {0x45, 0x301}}, {0xCA, {0x45, 0x302}}, {0xCB, {0x45, 0x308}}, {0xCC, {0x49, 0x300}}, {0xCD, {0x49, 0x301}}, {0xCE, {0x49, 0x302}}, {0xCF, {0x49, 0x308}}, {0xD1, {0x4E, 0x303}}, +{0xD2, {0x4F, 0x300}}, {0xD3, {0x4F, 0x301}}, {0xD4, {0x4F, 0x302}}, {0xD5, {0x4F, 0x303}}, {0xD6, {0x4F, 0x308}}, {0xD9, {0x55, 0x300}}, {0xDA, {0x55, 0x301}}, {0xDB, {0x55, 0x302}}, +{0xDC, {0x55, 0x308}}, {0xDD, {0x59, 0x301}}, {0xE0, {0x61, 0x300}}, {0xE1, {0x61, 0x301}}, {0xE2, {0x61, 0x302}}, {0xE3, {0x61, 0x303}}, {0xE4, {0x61, 0x308}}, {0xE5, {0x61, 0x30A}}, +{0xE7, {0x63, 0x327}}, {0xE8, {0x65, 0x300}}, {0xE9, {0x65, 0x301}}, {0xEA, {0x65, 0x302}}, {0xEB, {0x65, 0x308}}, {0xEC, {0x69, 0x300}}, {0xED, {0x69, 0x301}}, {0xEE, {0x69, 0x302}}, +{0xEF, {0x69, 0x308}}, {0xF1, {0x6E, 0x303}}, {0xF2, {0x6F, 0x300}}, {0xF3, {0x6F, 0x301}}, {0xF4, {0x6F, 0x302}}, {0xF5, {0x6F, 0x303}}, {0xF6, {0x6F, 0x308}}, {0xF9, {0x75, 0x300}}, +{0xFA, {0x75, 0x301}}, {0xFB, {0x75, 0x302}}, {0xFC, {0x75, 0x308}}, {0xFD, {0x79, 0x301}}, {0xFF, {0x79, 0x308}}, {0x100, {0x41, 0x304}}, {0x101, {0x61, 0x304}}, {0x102, {0x41, 0x306}}, +{0x103, {0x61, 0x306}}, {0x104, {0x41, 0x328}}, {0x105, {0x61, 0x328}}, {0x106, {0x43, 0x301}}, {0x107, {0x63, 0x301}}, {0x108, {0x43, 0x302}}, {0x109, {0x63, 0x302}}, {0x10A, {0x43, 0x307}}, +{0x10B, {0x63, 0x307}}, {0x10C, {0x43, 0x30C}}, {0x10D, {0x63, 0x30C}}, {0x10E, {0x44, 0x30C}}, {0x10F, {0x64, 0x30C}}, {0x112, {0x45, 0x304}}, {0x113, {0x65, 0x304}}, {0x114, {0x45, 0x306}}, +{0x115, {0x65, 0x306}}, {0x116, {0x45, 0x307}}, {0x117, {0x65, 0x307}}, {0x118, {0x45, 0x328}}, {0x119, {0x65, 0x328}}, {0x11A, {0x45, 0x30C}}, {0x11B, {0x65, 0x30C}}, {0x11C, {0x47, 0x302}}, +{0x11D, {0x67, 0x302}}, {0x11E, {0x47, 0x306}}, {0x11F, {0x67, 0x306}}, {0x120, {0x47, 0x307}}, {0x121, {0x67, 0x307}}, {0x122, {0x47, 0x327}}, {0x123, {0x67, 0x327}}, {0x124, {0x48, 0x302}}, +{0x125, {0x68, 0x302}}, {0x128, {0x49, 0x303}}, {0x129, {0x69, 0x303}}, {0x12A, {0x49, 0x304}}, {0x12B, {0x69, 0x304}}, {0x12C, {0x49, 0x306}}, {0x12D, {0x69, 0x306}}, {0x12E, {0x49, 0x328}}, +{0x12F, {0x69, 0x328}}, {0x130, {0x49, 0x307}}, {0x134, {0x4A, 0x302}}, {0x135, {0x6A, 0x302}}, {0x136, {0x4B, 0x327}}, {0x137, {0x6B, 0x327}}, {0x139, {0x4C, 0x301}}, {0x13A, {0x6C, 0x301}}, +{0x13B, {0x4C, 0x327}}, {0x13C, {0x6C, 0x327}}, {0x13D, {0x4C, 0x30C}}, {0x13E, {0x6C, 0x30C}}, {0x143, {0x4E, 0x301}}, {0x144, {0x6E, 0x301}}, {0x145, {0x4E, 0x327}}, {0x146, {0x6E, 0x327}}, +{0x147, {0x4E, 0x30C}}, {0x148, {0x6E, 0x30C}}, {0x14C, {0x4F, 0x304}}, {0x14D, {0x6F, 0x304}}, {0x14E, {0x4F, 0x306}}, {0x14F, {0x6F, 0x306}}, {0x150, {0x4F, 0x30B}}, {0x151, {0x6F, 0x30B}}, +{0x154, {0x52, 0x301}}, {0x155, {0x72, 0x301}}, {0x156, {0x52, 0x327}}, {0x157, {0x72, 0x327}}, {0x158, {0x52, 0x30C}}, {0x159, {0x72, 0x30C}}, {0x15A, {0x53, 0x301}}, {0x15B, {0x73, 0x301}}, +{0x15C, {0x53, 0x302}}, {0x15D, {0x73, 0x302}}, {0x15E, {0x53, 0x327}}, {0x15F, {0x73, 0x327}}, {0x160, {0x53, 0x30C}}, {0x161, {0x73, 0x30C}}, {0x162, {0x54, 0x327}}, {0x163, {0x74, 0x327}}, +{0x164, {0x54, 0x30C}}, {0x165, {0x74, 0x30C}}, {0x168, {0x55, 0x303}}, {0x169, {0x75, 0x303}}, {0x16A, {0x55, 0x304}}, {0x16B, {0x75, 0x304}}, {0x16C, {0x55, 0x306}}, {0x16D, {0x75, 0x306}}, +{0x16E, {0x55, 0x30A}}, {0x16F, {0x75, 0x30A}}, {0x170, {0x55, 0x30B}}, {0x171, {0x75, 0x30B}}, {0x172, {0x55, 0x328}}, {0x173, {0x75, 0x328}}, {0x174, {0x57, 0x302}}, {0x175, {0x77, 0x302}}, +{0x176, {0x59, 0x302}}, {0x177, {0x79, 0x302}}, {0x178, {0x59, 0x308}}, {0x179, {0x5A, 0x301}}, {0x17A, {0x7A, 0x301}}, {0x17B, {0x5A, 0x307}}, {0x17C, {0x7A, 0x307}}, {0x17D, {0x5A, 0x30C}}, +{0x17E, {0x7A, 0x30C}}, {0x1A0, {0x4F, 0x31B}}, {0x1A1, {0x6F, 0x31B}}, {0x1AF, {0x55, 0x31B}}, {0x1B0, {0x75, 0x31B}}, {0x1CD, {0x41, 0x30C}}, {0x1CE, {0x61, 0x30C}}, {0x1CF, {0x49, 0x30C}}, +{0x1D0, {0x69, 0x30C}}, {0x1D1, {0x4F, 0x30C}}, {0x1D2, {0x6F, 0x30C}}, {0x1D3, {0x55, 0x30C}}, {0x1D4, {0x75, 0x30C}}, {0x1D5, {0x55, 0x308, 0x304}}, {0x1D6, {0x75, 0x308, 0x304}}, +{0x1D7, {0x55, 0x308, 0x301}}, {0x1D8, {0x75, 0x308, 0x301}}, {0x1D9, {0x55, 0x308, 0x30C}}, {0x1DA, {0x75, 0x308, 0x30C}}, {0x1DB, {0x55, 0x308, 0x300}}, {0x1DC, {0x75, 0x308, 0x300}}, +{0x1DE, {0x41, 0x308, 0x304}}, {0x1DF, {0x61, 0x308, 0x304}}, {0x1E0, {0x41, 0x307, 0x304}}, {0x1E1, {0x61, 0x307, 0x304}}, {0x1E2, {0xC6, 0x304}}, {0x1E3, {0xE6, 0x304}}, {0x1E6, {0x47, 0x30C}}, +{0x1E7, {0x67, 0x30C}}, {0x1E8, {0x4B, 0x30C}}, {0x1E9, {0x6B, 0x30C}}, {0x1EA, {0x4F, 0x328}}, {0x1EB, {0x6F, 0x328}}, {0x1EC, {0x4F, 0x328, 0x304}}, {0x1ED, {0x6F, 0x328, 0x304}}, +{0x1EE, {0x1B7, 0x30C}}, {0x1EF, {0x292, 0x30C}}, {0x1F0, {0x6A, 0x30C}}, {0x1F4, {0x47, 0x301}}, {0x1F5, {0x67, 0x301}}, {0x1F8, {0x4E, 0x300}}, {0x1F9, {0x6E, 0x300}}, {0x1FA, {0x41, 0x30A, 0x301}}, +{0x1FB, {0x61, 0x30A, 0x301}}, {0x1FC, {0xC6, 0x301}}, {0x1FD, {0xE6, 0x301}}, {0x1FE, {0xD8, 0x301}}, {0x1FF, {0xF8, 0x301}}, {0x200, {0x41, 0x30F}}, {0x201, {0x61, 0x30F}}, {0x202, {0x41, 0x311}}, +{0x203, {0x61, 0x311}}, {0x204, {0x45, 0x30F}}, {0x205, {0x65, 0x30F}}, {0x206, {0x45, 0x311}}, {0x207, {0x65, 0x311}}, {0x208, {0x49, 0x30F}}, {0x209, {0x69, 0x30F}}, {0x20A, {0x49, 0x311}}, +{0x20B, {0x69, 0x311}}, {0x20C, {0x4F, 0x30F}}, {0x20D, {0x6F, 0x30F}}, {0x20E, {0x4F, 0x311}}, {0x20F, {0x6F, 0x311}}, {0x210, {0x52, 0x30F}}, {0x211, {0x72, 0x30F}}, {0x212, {0x52, 0x311}}, +{0x213, {0x72, 0x311}}, {0x214, {0x55, 0x30F}}, {0x215, {0x75, 0x30F}}, {0x216, {0x55, 0x311}}, {0x217, {0x75, 0x311}}, {0x218, {0x53, 0x326}}, {0x219, {0x73, 0x326}}, {0x21A, {0x54, 0x326}}, +{0x21B, {0x74, 0x326}}, {0x21E, {0x48, 0x30C}}, {0x21F, {0x68, 0x30C}}, {0x226, {0x41, 0x307}}, {0x227, {0x61, 0x307}}, {0x228, {0x45, 0x327}}, {0x229, {0x65, 0x327}}, {0x22A, {0x4F, 0x308, 0x304}}, +{0x22B, {0x6F, 0x308, 0x304}}, {0x22C, {0x4F, 0x303, 0x304}}, {0x22D, {0x6F, 0x303, 0x304}}, {0x22E, {0x4F, 0x307}}, {0x22F, {0x6F, 0x307}}, {0x230, {0x4F, 0x307, 0x304}}, +{0x231, {0x6F, 0x307, 0x304}}, {0x232, {0x59, 0x304}}, {0x233, {0x79, 0x304}}, {0x340, {0x300}}, {0x341, {0x301}}, {0x343, {0x313}}, {0x344, {0x308, 0x301}}, {0x374, {0x2B9}}, {0x37E, {0x3B}}, +{0x385, {0xA8, 0x301}}, {0x386, {0x391, 0x301}}, {0x387, {0xB7}}, {0x388, {0x395, 0x301}}, {0x389, {0x397, 0x301}}, {0x38A, {0x399, 0x301}}, {0x38C, {0x39F, 0x301}}, {0x38E, {0x3A5, 0x301}}, +{0x38F, {0x3A9, 0x301}}, {0x390, {0x3B9, 0x308, 0x301}}, {0x3AA, {0x399, 0x308}}, {0x3AB, {0x3A5, 0x308}}, {0x3AC, {0x3B1, 0x301}}, {0x3AD, {0x3B5, 0x301}}, {0x3AE, {0x3B7, 0x301}}, +{0x3AF, {0x3B9, 0x301}}, {0x3B0, {0x3C5, 0x308, 0x301}}, {0x3CA, {0x3B9, 0x308}}, {0x3CB, {0x3C5, 0x308}}, {0x3CC, {0x3BF, 0x301}}, {0x3CD, {0x3C5, 0x301}}, {0x3CE, {0x3C9, 0x301}}, +{0x3D3, {0x3D2, 0x301}}, {0x3D4, {0x3D2, 0x308}}, {0x400, {0x415, 0x300}}, {0x401, {0x415, 0x308}}, {0x403, {0x413, 0x301}}, {0x407, {0x406, 0x308}}, {0x40C, {0x41A, 0x301}}, {0x40D, {0x418, 0x300}}, +{0x40E, {0x423, 0x306}}, {0x419, {0x418, 0x306}}, {0x439, {0x438, 0x306}}, {0x450, {0x435, 0x300}}, {0x451, {0x435, 0x308}}, {0x453, {0x433, 0x301}}, {0x457, {0x456, 0x308}}, {0x45C, {0x43A, 0x301}}, +{0x45D, {0x438, 0x300}}, {0x45E, {0x443, 0x306}}, {0x476, {0x474, 0x30F}}, {0x477, {0x475, 0x30F}}, {0x4C1, {0x416, 0x306}}, {0x4C2, {0x436, 0x306}}, {0x4D0, {0x410, 0x306}}, {0x4D1, {0x430, 0x306}}, +{0x4D2, {0x410, 0x308}}, {0x4D3, {0x430, 0x308}}, {0x4D6, {0x415, 0x306}}, {0x4D7, {0x435, 0x306}}, {0x4DA, {0x4D8, 0x308}}, {0x4DB, {0x4D9, 0x308}}, {0x4DC, {0x416, 0x308}}, {0x4DD, {0x436, 0x308}}, +{0x4DE, {0x417, 0x308}}, {0x4DF, {0x437, 0x308}}, {0x4E2, {0x418, 0x304}}, {0x4E3, {0x438, 0x304}}, {0x4E4, {0x418, 0x308}}, {0x4E5, {0x438, 0x308}}, {0x4E6, {0x41E, 0x308}}, {0x4E7, {0x43E, 0x308}}, +{0x4EA, {0x4E8, 0x308}}, {0x4EB, {0x4E9, 0x308}}, {0x4EC, {0x42D, 0x308}}, {0x4ED, {0x44D, 0x308}}, {0x4EE, {0x423, 0x304}}, {0x4EF, {0x443, 0x304}}, {0x4F0, {0x423, 0x308}}, {0x4F1, {0x443, 0x308}}, +{0x4F2, {0x423, 0x30B}}, {0x4F3, {0x443, 0x30B}}, {0x4F4, {0x427, 0x308}}, {0x4F5, {0x447, 0x308}}, {0x4F8, {0x42B, 0x308}}, {0x4F9, {0x44B, 0x308}}, {0x622, {0x627, 0x653}}, {0x623, {0x627, 0x654}}, +{0x624, {0x648, 0x654}}, {0x625, {0x627, 0x655}}, {0x626, {0x64A, 0x654}}, {0x6C0, {0x6D5, 0x654}}, {0x6C2, {0x6C1, 0x654}}, {0x6D3, {0x6D2, 0x654}}, {0x929, {0x928, 0x93C}}, {0x931, {0x930, 0x93C}}, +{0x934, {0x933, 0x93C}}, {0x958, {0x915, 0x93C}}, {0x959, {0x916, 0x93C}}, {0x95A, {0x917, 0x93C}}, {0x95B, {0x91C, 0x93C}}, {0x95C, {0x921, 0x93C}}, {0x95D, {0x922, 0x93C}}, {0x95E, {0x92B, 0x93C}}, +{0x95F, {0x92F, 0x93C}}, {0x9CB, {0x9C7, 0x9BE}}, {0x9CC, {0x9C7, 0x9D7}}, {0x9DC, {0x9A1, 0x9BC}}, {0x9DD, {0x9A2, 0x9BC}}, {0x9DF, {0x9AF, 0x9BC}}, {0xA33, {0xA32, 0xA3C}}, {0xA36, {0xA38, 0xA3C}}, +{0xA59, {0xA16, 0xA3C}}, {0xA5A, {0xA17, 0xA3C}}, {0xA5B, {0xA1C, 0xA3C}}, {0xA5E, {0xA2B, 0xA3C}}, {0xB48, {0xB47, 0xB56}}, {0xB4B, {0xB47, 0xB3E}}, {0xB4C, {0xB47, 0xB57}}, {0xB5C, {0xB21, 0xB3C}}, +{0xB5D, {0xB22, 0xB3C}}, {0xB94, {0xB92, 0xBD7}}, {0xBCA, {0xBC6, 0xBBE}}, {0xBCB, {0xBC7, 0xBBE}}, {0xBCC, {0xBC6, 0xBD7}}, {0xC48, {0xC46, 0xC56}}, {0xCC0, {0xCBF, 0xCD5}}, {0xCC7, {0xCC6, 0xCD5}}, +{0xCC8, {0xCC6, 0xCD6}}, {0xCCA, {0xCC6, 0xCC2}}, {0xCCB, {0xCC6, 0xCC2, 0xCD5}}, {0xD4A, {0xD46, 0xD3E}}, {0xD4B, {0xD47, 0xD3E}}, {0xD4C, {0xD46, 0xD57}}, {0xDDA, {0xDD9, 0xDCA}}, +{0xDDC, {0xDD9, 0xDCF}}, {0xDDD, {0xDD9, 0xDCF, 0xDCA}}, {0xDDE, {0xDD9, 0xDDF}}, {0xF43, {0xF42, 0xFB7}}, {0xF4D, {0xF4C, 0xFB7}}, {0xF52, {0xF51, 0xFB7}}, {0xF57, {0xF56, 0xFB7}}, +{0xF5C, {0xF5B, 0xFB7}}, {0xF69, {0xF40, 0xFB5}}, {0xF73, {0xF71, 0xF72}}, {0xF75, {0xF71, 0xF74}}, {0xF76, {0xFB2, 0xF80}}, {0xF78, {0xFB3, 0xF80}}, {0xF81, {0xF71, 0xF80}}, {0xF93, {0xF92, 0xFB7}}, +{0xF9D, {0xF9C, 0xFB7}}, {0xFA2, {0xFA1, 0xFB7}}, {0xFA7, {0xFA6, 0xFB7}}, {0xFAC, {0xFAB, 0xFB7}}, {0xFB9, {0xF90, 0xFB5}}, {0x1026, {0x1025, 0x102E}}, {0x1B06, {0x1B05, 0x1B35}}, +{0x1B08, {0x1B07, 0x1B35}}, {0x1B0A, {0x1B09, 0x1B35}}, {0x1B0C, {0x1B0B, 0x1B35}}, {0x1B0E, {0x1B0D, 0x1B35}}, {0x1B12, {0x1B11, 0x1B35}}, {0x1B3B, {0x1B3A, 0x1B35}}, {0x1B3D, {0x1B3C, 0x1B35}}, +{0x1B40, {0x1B3E, 0x1B35}}, {0x1B41, {0x1B3F, 0x1B35}}, {0x1B43, {0x1B42, 0x1B35}}, {0x1E00, {0x41, 0x325}}, {0x1E01, {0x61, 0x325}}, {0x1E02, {0x42, 0x307}}, {0x1E03, {0x62, 0x307}}, +{0x1E04, {0x42, 0x323}}, {0x1E05, {0x62, 0x323}}, {0x1E06, {0x42, 0x331}}, {0x1E07, {0x62, 0x331}}, {0x1E08, {0x43, 0x327, 0x301}}, {0x1E09, {0x63, 0x327, 0x301}}, {0x1E0A, {0x44, 0x307}}, +{0x1E0B, {0x64, 0x307}}, {0x1E0C, {0x44, 0x323}}, {0x1E0D, {0x64, 0x323}}, {0x1E0E, {0x44, 0x331}}, {0x1E0F, {0x64, 0x331}}, {0x1E10, {0x44, 0x327}}, {0x1E11, {0x64, 0x327}}, {0x1E12, {0x44, 0x32D}}, +{0x1E13, {0x64, 0x32D}}, {0x1E14, {0x45, 0x304, 0x300}}, {0x1E15, {0x65, 0x304, 0x300}}, {0x1E16, {0x45, 0x304, 0x301}}, {0x1E17, {0x65, 0x304, 0x301}}, {0x1E18, {0x45, 0x32D}}, +{0x1E19, {0x65, 0x32D}}, {0x1E1A, {0x45, 0x330}}, {0x1E1B, {0x65, 0x330}}, {0x1E1C, {0x45, 0x327, 0x306}}, {0x1E1D, {0x65, 0x327, 0x306}}, {0x1E1E, {0x46, 0x307}}, {0x1E1F, {0x66, 0x307}}, +{0x1E20, {0x47, 0x304}}, {0x1E21, {0x67, 0x304}}, {0x1E22, {0x48, 0x307}}, {0x1E23, {0x68, 0x307}}, {0x1E24, {0x48, 0x323}}, {0x1E25, {0x68, 0x323}}, {0x1E26, {0x48, 0x308}}, {0x1E27, {0x68, 0x308}}, +{0x1E28, {0x48, 0x327}}, {0x1E29, {0x68, 0x327}}, {0x1E2A, {0x48, 0x32E}}, {0x1E2B, {0x68, 0x32E}}, {0x1E2C, {0x49, 0x330}}, {0x1E2D, {0x69, 0x330}}, {0x1E2E, {0x49, 0x308, 0x301}}, +{0x1E2F, {0x69, 0x308, 0x301}}, {0x1E30, {0x4B, 0x301}}, {0x1E31, {0x6B, 0x301}}, {0x1E32, {0x4B, 0x323}}, {0x1E33, {0x6B, 0x323}}, {0x1E34, {0x4B, 0x331}}, {0x1E35, {0x6B, 0x331}}, +{0x1E36, {0x4C, 0x323}}, {0x1E37, {0x6C, 0x323}}, {0x1E38, {0x4C, 0x323, 0x304}}, {0x1E39, {0x6C, 0x323, 0x304}}, {0x1E3A, {0x4C, 0x331}}, {0x1E3B, {0x6C, 0x331}}, {0x1E3C, {0x4C, 0x32D}}, +{0x1E3D, {0x6C, 0x32D}}, {0x1E3E, {0x4D, 0x301}}, {0x1E3F, {0x6D, 0x301}}, {0x1E40, {0x4D, 0x307}}, {0x1E41, {0x6D, 0x307}}, {0x1E42, {0x4D, 0x323}}, {0x1E43, {0x6D, 0x323}}, {0x1E44, {0x4E, 0x307}}, +{0x1E45, {0x6E, 0x307}}, {0x1E46, {0x4E, 0x323}}, {0x1E47, {0x6E, 0x323}}, {0x1E48, {0x4E, 0x331}}, {0x1E49, {0x6E, 0x331}}, {0x1E4A, {0x4E, 0x32D}}, {0x1E4B, {0x6E, 0x32D}}, +{0x1E4C, {0x4F, 0x303, 0x301}}, {0x1E4D, {0x6F, 0x303, 0x301}}, {0x1E4E, {0x4F, 0x303, 0x308}}, {0x1E4F, {0x6F, 0x303, 0x308}}, {0x1E50, {0x4F, 0x304, 0x300}}, {0x1E51, {0x6F, 0x304, 0x300}}, +{0x1E52, {0x4F, 0x304, 0x301}}, {0x1E53, {0x6F, 0x304, 0x301}}, {0x1E54, {0x50, 0x301}}, {0x1E55, {0x70, 0x301}}, {0x1E56, {0x50, 0x307}}, {0x1E57, {0x70, 0x307}}, {0x1E58, {0x52, 0x307}}, +{0x1E59, {0x72, 0x307}}, {0x1E5A, {0x52, 0x323}}, {0x1E5B, {0x72, 0x323}}, {0x1E5C, {0x52, 0x323, 0x304}}, {0x1E5D, {0x72, 0x323, 0x304}}, {0x1E5E, {0x52, 0x331}}, {0x1E5F, {0x72, 0x331}}, +{0x1E60, {0x53, 0x307}}, {0x1E61, {0x73, 0x307}}, {0x1E62, {0x53, 0x323}}, {0x1E63, {0x73, 0x323}}, {0x1E64, {0x53, 0x301, 0x307}}, {0x1E65, {0x73, 0x301, 0x307}}, {0x1E66, {0x53, 0x30C, 0x307}}, +{0x1E67, {0x73, 0x30C, 0x307}}, {0x1E68, {0x53, 0x323, 0x307}}, {0x1E69, {0x73, 0x323, 0x307}}, {0x1E6A, {0x54, 0x307}}, {0x1E6B, {0x74, 0x307}}, {0x1E6C, {0x54, 0x323}}, {0x1E6D, {0x74, 0x323}}, +{0x1E6E, {0x54, 0x331}}, {0x1E6F, {0x74, 0x331}}, {0x1E70, {0x54, 0x32D}}, {0x1E71, {0x74, 0x32D}}, {0x1E72, {0x55, 0x324}}, {0x1E73, {0x75, 0x324}}, {0x1E74, {0x55, 0x330}}, {0x1E75, {0x75, 0x330}}, +{0x1E76, {0x55, 0x32D}}, {0x1E77, {0x75, 0x32D}}, {0x1E78, {0x55, 0x303, 0x301}}, {0x1E79, {0x75, 0x303, 0x301}}, {0x1E7A, {0x55, 0x304, 0x308}}, {0x1E7B, {0x75, 0x304, 0x308}}, +{0x1E7C, {0x56, 0x303}}, {0x1E7D, {0x76, 0x303}}, {0x1E7E, {0x56, 0x323}}, {0x1E7F, {0x76, 0x323}}, {0x1E80, {0x57, 0x300}}, {0x1E81, {0x77, 0x300}}, {0x1E82, {0x57, 0x301}}, {0x1E83, {0x77, 0x301}}, +{0x1E84, {0x57, 0x308}}, {0x1E85, {0x77, 0x308}}, {0x1E86, {0x57, 0x307}}, {0x1E87, {0x77, 0x307}}, {0x1E88, {0x57, 0x323}}, {0x1E89, {0x77, 0x323}}, {0x1E8A, {0x58, 0x307}}, {0x1E8B, {0x78, 0x307}}, +{0x1E8C, {0x58, 0x308}}, {0x1E8D, {0x78, 0x308}}, {0x1E8E, {0x59, 0x307}}, {0x1E8F, {0x79, 0x307}}, {0x1E90, {0x5A, 0x302}}, {0x1E91, {0x7A, 0x302}}, {0x1E92, {0x5A, 0x323}}, {0x1E93, {0x7A, 0x323}}, +{0x1E94, {0x5A, 0x331}}, {0x1E95, {0x7A, 0x331}}, {0x1E96, {0x68, 0x331}}, {0x1E97, {0x74, 0x308}}, {0x1E98, {0x77, 0x30A}}, {0x1E99, {0x79, 0x30A}}, {0x1E9B, {0x17F, 0x307}}, {0x1EA0, {0x41, 0x323}}, +{0x1EA1, {0x61, 0x323}}, {0x1EA2, {0x41, 0x309}}, {0x1EA3, {0x61, 0x309}}, {0x1EA4, {0x41, 0x302, 0x301}}, {0x1EA5, {0x61, 0x302, 0x301}}, {0x1EA6, {0x41, 0x302, 0x300}}, +{0x1EA7, {0x61, 0x302, 0x300}}, {0x1EA8, {0x41, 0x302, 0x309}}, {0x1EA9, {0x61, 0x302, 0x309}}, {0x1EAA, {0x41, 0x302, 0x303}}, {0x1EAB, {0x61, 0x302, 0x303}}, {0x1EAC, {0x41, 0x323, 0x302}}, +{0x1EAD, {0x61, 0x323, 0x302}}, {0x1EAE, {0x41, 0x306, 0x301}}, {0x1EAF, {0x61, 0x306, 0x301}}, {0x1EB0, {0x41, 0x306, 0x300}}, {0x1EB1, {0x61, 0x306, 0x300}}, {0x1EB2, {0x41, 0x306, 0x309}}, +{0x1EB3, {0x61, 0x306, 0x309}}, {0x1EB4, {0x41, 0x306, 0x303}}, {0x1EB5, {0x61, 0x306, 0x303}}, {0x1EB6, {0x41, 0x323, 0x306}}, {0x1EB7, {0x61, 0x323, 0x306}}, {0x1EB8, {0x45, 0x323}}, +{0x1EB9, {0x65, 0x323}}, {0x1EBA, {0x45, 0x309}}, {0x1EBB, {0x65, 0x309}}, {0x1EBC, {0x45, 0x303}}, {0x1EBD, {0x65, 0x303}}, {0x1EBE, {0x45, 0x302, 0x301}}, {0x1EBF, {0x65, 0x302, 0x301}}, +{0x1EC0, {0x45, 0x302, 0x300}}, {0x1EC1, {0x65, 0x302, 0x300}}, {0x1EC2, {0x45, 0x302, 0x309}}, {0x1EC3, {0x65, 0x302, 0x309}}, {0x1EC4, {0x45, 0x302, 0x303}}, {0x1EC5, {0x65, 0x302, 0x303}}, +{0x1EC6, {0x45, 0x323, 0x302}}, {0x1EC7, {0x65, 0x323, 0x302}}, {0x1EC8, {0x49, 0x309}}, {0x1EC9, {0x69, 0x309}}, {0x1ECA, {0x49, 0x323}}, {0x1ECB, {0x69, 0x323}}, {0x1ECC, {0x4F, 0x323}}, +{0x1ECD, {0x6F, 0x323}}, {0x1ECE, {0x4F, 0x309}}, {0x1ECF, {0x6F, 0x309}}, {0x1ED0, {0x4F, 0x302, 0x301}}, {0x1ED1, {0x6F, 0x302, 0x301}}, {0x1ED2, {0x4F, 0x302, 0x300}}, +{0x1ED3, {0x6F, 0x302, 0x300}}, {0x1ED4, {0x4F, 0x302, 0x309}}, {0x1ED5, {0x6F, 0x302, 0x309}}, {0x1ED6, {0x4F, 0x302, 0x303}}, {0x1ED7, {0x6F, 0x302, 0x303}}, {0x1ED8, {0x4F, 0x323, 0x302}}, +{0x1ED9, {0x6F, 0x323, 0x302}}, {0x1EDA, {0x4F, 0x31B, 0x301}}, {0x1EDB, {0x6F, 0x31B, 0x301}}, {0x1EDC, {0x4F, 0x31B, 0x300}}, {0x1EDD, {0x6F, 0x31B, 0x300}}, {0x1EDE, {0x4F, 0x31B, 0x309}}, +{0x1EDF, {0x6F, 0x31B, 0x309}}, {0x1EE0, {0x4F, 0x31B, 0x303}}, {0x1EE1, {0x6F, 0x31B, 0x303}}, {0x1EE2, {0x4F, 0x31B, 0x323}}, {0x1EE3, {0x6F, 0x31B, 0x323}}, {0x1EE4, {0x55, 0x323}}, +{0x1EE5, {0x75, 0x323}}, {0x1EE6, {0x55, 0x309}}, {0x1EE7, {0x75, 0x309}}, {0x1EE8, {0x55, 0x31B, 0x301}}, {0x1EE9, {0x75, 0x31B, 0x301}}, {0x1EEA, {0x55, 0x31B, 0x300}}, +{0x1EEB, {0x75, 0x31B, 0x300}}, {0x1EEC, {0x55, 0x31B, 0x309}}, {0x1EED, {0x75, 0x31B, 0x309}}, {0x1EEE, {0x55, 0x31B, 0x303}}, {0x1EEF, {0x75, 0x31B, 0x303}}, {0x1EF0, {0x55, 0x31B, 0x323}}, +{0x1EF1, {0x75, 0x31B, 0x323}}, {0x1EF2, {0x59, 0x300}}, {0x1EF3, {0x79, 0x300}}, {0x1EF4, {0x59, 0x323}}, {0x1EF5, {0x79, 0x323}}, {0x1EF6, {0x59, 0x309}}, {0x1EF7, {0x79, 0x309}}, +{0x1EF8, {0x59, 0x303}}, {0x1EF9, {0x79, 0x303}}, {0x1F00, {0x3B1, 0x313}}, {0x1F01, {0x3B1, 0x314}}, {0x1F02, {0x3B1, 0x313, 0x300}}, {0x1F03, {0x3B1, 0x314, 0x300}}, {0x1F04, {0x3B1, 0x313, 0x301}}, +{0x1F05, {0x3B1, 0x314, 0x301}}, {0x1F06, {0x3B1, 0x313, 0x342}}, {0x1F07, {0x3B1, 0x314, 0x342}}, {0x1F08, {0x391, 0x313}}, {0x1F09, {0x391, 0x314}}, {0x1F0A, {0x391, 0x313, 0x300}}, +{0x1F0B, {0x391, 0x314, 0x300}}, {0x1F0C, {0x391, 0x313, 0x301}}, {0x1F0D, {0x391, 0x314, 0x301}}, {0x1F0E, {0x391, 0x313, 0x342}}, {0x1F0F, {0x391, 0x314, 0x342}}, {0x1F10, {0x3B5, 0x313}}, +{0x1F11, {0x3B5, 0x314}}, {0x1F12, {0x3B5, 0x313, 0x300}}, {0x1F13, {0x3B5, 0x314, 0x300}}, {0x1F14, {0x3B5, 0x313, 0x301}}, {0x1F15, {0x3B5, 0x314, 0x301}}, {0x1F18, {0x395, 0x313}}, +{0x1F19, {0x395, 0x314}}, {0x1F1A, {0x395, 0x313, 0x300}}, {0x1F1B, {0x395, 0x314, 0x300}}, {0x1F1C, {0x395, 0x313, 0x301}}, {0x1F1D, {0x395, 0x314, 0x301}}, {0x1F20, {0x3B7, 0x313}}, +{0x1F21, {0x3B7, 0x314}}, {0x1F22, {0x3B7, 0x313, 0x300}}, {0x1F23, {0x3B7, 0x314, 0x300}}, {0x1F24, {0x3B7, 0x313, 0x301}}, {0x1F25, {0x3B7, 0x314, 0x301}}, {0x1F26, {0x3B7, 0x313, 0x342}}, +{0x1F27, {0x3B7, 0x314, 0x342}}, {0x1F28, {0x397, 0x313}}, {0x1F29, {0x397, 0x314}}, {0x1F2A, {0x397, 0x313, 0x300}}, {0x1F2B, {0x397, 0x314, 0x300}}, {0x1F2C, {0x397, 0x313, 0x301}}, +{0x1F2D, {0x397, 0x314, 0x301}}, {0x1F2E, {0x397, 0x313, 0x342}}, {0x1F2F, {0x397, 0x314, 0x342}}, {0x1F30, {0x3B9, 0x313}}, {0x1F31, {0x3B9, 0x314}}, {0x1F32, {0x3B9, 0x313, 0x300}}, +{0x1F33, {0x3B9, 0x314, 0x300}}, {0x1F34, {0x3B9, 0x313, 0x301}}, {0x1F35, {0x3B9, 0x314, 0x301}}, {0x1F36, {0x3B9, 0x313, 0x342}}, {0x1F37, {0x3B9, 0x314, 0x342}}, {0x1F38, {0x399, 0x313}}, +{0x1F39, {0x399, 0x314}}, {0x1F3A, {0x399, 0x313, 0x300}}, {0x1F3B, {0x399, 0x314, 0x300}}, {0x1F3C, {0x399, 0x313, 0x301}}, {0x1F3D, {0x399, 0x314, 0x301}}, {0x1F3E, {0x399, 0x313, 0x342}}, +{0x1F3F, {0x399, 0x314, 0x342}}, {0x1F40, {0x3BF, 0x313}}, {0x1F41, {0x3BF, 0x314}}, {0x1F42, {0x3BF, 0x313, 0x300}}, {0x1F43, {0x3BF, 0x314, 0x300}}, {0x1F44, {0x3BF, 0x313, 0x301}}, +{0x1F45, {0x3BF, 0x314, 0x301}}, {0x1F48, {0x39F, 0x313}}, {0x1F49, {0x39F, 0x314}}, {0x1F4A, {0x39F, 0x313, 0x300}}, {0x1F4B, {0x39F, 0x314, 0x300}}, {0x1F4C, {0x39F, 0x313, 0x301}}, +{0x1F4D, {0x39F, 0x314, 0x301}}, {0x1F50, {0x3C5, 0x313}}, {0x1F51, {0x3C5, 0x314}}, {0x1F52, {0x3C5, 0x313, 0x300}}, {0x1F53, {0x3C5, 0x314, 0x300}}, {0x1F54, {0x3C5, 0x313, 0x301}}, +{0x1F55, {0x3C5, 0x314, 0x301}}, {0x1F56, {0x3C5, 0x313, 0x342}}, {0x1F57, {0x3C5, 0x314, 0x342}}, {0x1F59, {0x3A5, 0x314}}, {0x1F5B, {0x3A5, 0x314, 0x300}}, {0x1F5D, {0x3A5, 0x314, 0x301}}, +{0x1F5F, {0x3A5, 0x314, 0x342}}, {0x1F60, {0x3C9, 0x313}}, {0x1F61, {0x3C9, 0x314}}, {0x1F62, {0x3C9, 0x313, 0x300}}, {0x1F63, {0x3C9, 0x314, 0x300}}, {0x1F64, {0x3C9, 0x313, 0x301}}, +{0x1F65, {0x3C9, 0x314, 0x301}}, {0x1F66, {0x3C9, 0x313, 0x342}}, {0x1F67, {0x3C9, 0x314, 0x342}}, {0x1F68, {0x3A9, 0x313}}, {0x1F69, {0x3A9, 0x314}}, {0x1F6A, {0x3A9, 0x313, 0x300}}, +{0x1F6B, {0x3A9, 0x314, 0x300}}, {0x1F6C, {0x3A9, 0x313, 0x301}}, {0x1F6D, {0x3A9, 0x314, 0x301}}, {0x1F6E, {0x3A9, 0x313, 0x342}}, {0x1F6F, {0x3A9, 0x314, 0x342}}, {0x1F70, {0x3B1, 0x300}}, +{0x1F71, {0x3B1, 0x301}}, {0x1F72, {0x3B5, 0x300}}, {0x1F73, {0x3B5, 0x301}}, {0x1F74, {0x3B7, 0x300}}, {0x1F75, {0x3B7, 0x301}}, {0x1F76, {0x3B9, 0x300}}, {0x1F77, {0x3B9, 0x301}}, +{0x1F78, {0x3BF, 0x300}}, {0x1F79, {0x3BF, 0x301}}, {0x1F7A, {0x3C5, 0x300}}, {0x1F7B, {0x3C5, 0x301}}, {0x1F7C, {0x3C9, 0x300}}, {0x1F7D, {0x3C9, 0x301}}, {0x1F80, {0x3B1, 0x313, 0x345}}, +{0x1F81, {0x3B1, 0x314, 0x345}}, {0x1F82, {0x3B1, 0x313, 0x300, 0x345}}, {0x1F83, {0x3B1, 0x314, 0x300, 0x345}}, {0x1F84, {0x3B1, 0x313, 0x301, 0x345}}, {0x1F85, {0x3B1, 0x314, 0x301, 0x345}}, +{0x1F86, {0x3B1, 0x313, 0x342, 0x345}}, {0x1F87, {0x3B1, 0x314, 0x342, 0x345}}, {0x1F88, {0x391, 0x313, 0x345}}, {0x1F89, {0x391, 0x314, 0x345}}, {0x1F8A, {0x391, 0x313, 0x300, 0x345}}, +{0x1F8B, {0x391, 0x314, 0x300, 0x345}}, {0x1F8C, {0x391, 0x313, 0x301, 0x345}}, {0x1F8D, {0x391, 0x314, 0x301, 0x345}}, {0x1F8E, {0x391, 0x313, 0x342, 0x345}}, {0x1F8F, {0x391, 0x314, 0x342, 0x345}}, +{0x1F90, {0x3B7, 0x313, 0x345}}, {0x1F91, {0x3B7, 0x314, 0x345}}, {0x1F92, {0x3B7, 0x313, 0x300, 0x345}}, {0x1F93, {0x3B7, 0x314, 0x300, 0x345}}, {0x1F94, {0x3B7, 0x313, 0x301, 0x345}}, +{0x1F95, {0x3B7, 0x314, 0x301, 0x345}}, {0x1F96, {0x3B7, 0x313, 0x342, 0x345}}, {0x1F97, {0x3B7, 0x314, 0x342, 0x345}}, {0x1F98, {0x397, 0x313, 0x345}}, {0x1F99, {0x397, 0x314, 0x345}}, +{0x1F9A, {0x397, 0x313, 0x300, 0x345}}, {0x1F9B, {0x397, 0x314, 0x300, 0x345}}, {0x1F9C, {0x397, 0x313, 0x301, 0x345}}, {0x1F9D, {0x397, 0x314, 0x301, 0x345}}, {0x1F9E, {0x397, 0x313, 0x342, 0x345}}, +{0x1F9F, {0x397, 0x314, 0x342, 0x345}}, {0x1FA0, {0x3C9, 0x313, 0x345}}, {0x1FA1, {0x3C9, 0x314, 0x345}}, {0x1FA2, {0x3C9, 0x313, 0x300, 0x345}}, {0x1FA3, {0x3C9, 0x314, 0x300, 0x345}}, +{0x1FA4, {0x3C9, 0x313, 0x301, 0x345}}, {0x1FA5, {0x3C9, 0x314, 0x301, 0x345}}, {0x1FA6, {0x3C9, 0x313, 0x342, 0x345}}, {0x1FA7, {0x3C9, 0x314, 0x342, 0x345}}, {0x1FA8, {0x3A9, 0x313, 0x345}}, +{0x1FA9, {0x3A9, 0x314, 0x345}}, {0x1FAA, {0x3A9, 0x313, 0x300, 0x345}}, {0x1FAB, {0x3A9, 0x314, 0x300, 0x345}}, {0x1FAC, {0x3A9, 0x313, 0x301, 0x345}}, {0x1FAD, {0x3A9, 0x314, 0x301, 0x345}}, +{0x1FAE, {0x3A9, 0x313, 0x342, 0x345}}, {0x1FAF, {0x3A9, 0x314, 0x342, 0x345}}, {0x1FB0, {0x3B1, 0x306}}, {0x1FB1, {0x3B1, 0x304}}, {0x1FB2, {0x3B1, 0x300, 0x345}}, {0x1FB3, {0x3B1, 0x345}}, +{0x1FB4, {0x3B1, 0x301, 0x345}}, {0x1FB6, {0x3B1, 0x342}}, {0x1FB7, {0x3B1, 0x342, 0x345}}, {0x1FB8, {0x391, 0x306}}, {0x1FB9, {0x391, 0x304}}, {0x1FBA, {0x391, 0x300}}, {0x1FBB, {0x391, 0x301}}, +{0x1FBC, {0x391, 0x345}}, {0x1FBE, {0x3B9}}, {0x1FC1, {0xA8, 0x342}}, {0x1FC2, {0x3B7, 0x300, 0x345}}, {0x1FC3, {0x3B7, 0x345}}, {0x1FC4, {0x3B7, 0x301, 0x345}}, {0x1FC6, {0x3B7, 0x342}}, +{0x1FC7, {0x3B7, 0x342, 0x345}}, {0x1FC8, {0x395, 0x300}}, {0x1FC9, {0x395, 0x301}}, {0x1FCA, {0x397, 0x300}}, {0x1FCB, {0x397, 0x301}}, {0x1FCC, {0x397, 0x345}}, {0x1FCD, {0x1FBF, 0x300}}, +{0x1FCE, {0x1FBF, 0x301}}, {0x1FCF, {0x1FBF, 0x342}}, {0x1FD0, {0x3B9, 0x306}}, {0x1FD1, {0x3B9, 0x304}}, {0x1FD2, {0x3B9, 0x308, 0x300}}, {0x1FD3, {0x3B9, 0x308, 0x301}}, {0x1FD6, {0x3B9, 0x342}}, +{0x1FD7, {0x3B9, 0x308, 0x342}}, {0x1FD8, {0x399, 0x306}}, {0x1FD9, {0x399, 0x304}}, {0x1FDA, {0x399, 0x300}}, {0x1FDB, {0x399, 0x301}}, {0x1FDD, {0x1FFE, 0x300}}, {0x1FDE, {0x1FFE, 0x301}}, +{0x1FDF, {0x1FFE, 0x342}}, {0x1FE0, {0x3C5, 0x306}}, {0x1FE1, {0x3C5, 0x304}}, {0x1FE2, {0x3C5, 0x308, 0x300}}, {0x1FE3, {0x3C5, 0x308, 0x301}}, {0x1FE4, {0x3C1, 0x313}}, {0x1FE5, {0x3C1, 0x314}}, +{0x1FE6, {0x3C5, 0x342}}, {0x1FE7, {0x3C5, 0x308, 0x342}}, {0x1FE8, {0x3A5, 0x306}}, {0x1FE9, {0x3A5, 0x304}}, {0x1FEA, {0x3A5, 0x300}}, {0x1FEB, {0x3A5, 0x301}}, {0x1FEC, {0x3A1, 0x314}}, +{0x1FED, {0xA8, 0x300}}, {0x1FEE, {0xA8, 0x301}}, {0x1FEF, {0x60}}, {0x1FF2, {0x3C9, 0x300, 0x345}}, {0x1FF3, {0x3C9, 0x345}}, {0x1FF4, {0x3C9, 0x301, 0x345}}, {0x1FF6, {0x3C9, 0x342}}, +{0x1FF7, {0x3C9, 0x342, 0x345}}, {0x1FF8, {0x39F, 0x300}}, {0x1FF9, {0x39F, 0x301}}, {0x1FFA, {0x3A9, 0x300}}, {0x1FFB, {0x3A9, 0x301}}, {0x1FFC, {0x3A9, 0x345}}, {0x1FFD, {0xB4}}, {0x2000, {0x2002}}, +{0x2001, {0x2003}}, {0x2126, {0x3A9}}, {0x212A, {0x4B}}, {0x212B, {0x41, 0x30A}}, {0x219A, {0x2190, 0x338}}, {0x219B, {0x2192, 0x338}}, {0x21AE, {0x2194, 0x338}}, {0x21CD, {0x21D0, 0x338}}, +{0x21CE, {0x21D4, 0x338}}, {0x21CF, {0x21D2, 0x338}}, {0x2204, {0x2203, 0x338}}, {0x2209, {0x2208, 0x338}}, {0x220C, {0x220B, 0x338}}, {0x2224, {0x2223, 0x338}}, {0x2226, {0x2225, 0x338}}, +{0x2241, {0x223C, 0x338}}, {0x2244, {0x2243, 0x338}}, {0x2247, {0x2245, 0x338}}, {0x2249, {0x2248, 0x338}}, {0x2260, {0x3D, 0x338}}, {0x2262, {0x2261, 0x338}}, {0x226D, {0x224D, 0x338}}, +{0x226E, {0x3C, 0x338}}, {0x226F, {0x3E, 0x338}}, {0x2270, {0x2264, 0x338}}, {0x2271, {0x2265, 0x338}}, {0x2274, {0x2272, 0x338}}, {0x2275, {0x2273, 0x338}}, {0x2278, {0x2276, 0x338}}, +{0x2279, {0x2277, 0x338}}, {0x2280, {0x227A, 0x338}}, {0x2281, {0x227B, 0x338}}, {0x2284, {0x2282, 0x338}}, {0x2285, {0x2283, 0x338}}, {0x2288, {0x2286, 0x338}}, {0x2289, {0x2287, 0x338}}, +{0x22AC, {0x22A2, 0x338}}, {0x22AD, {0x22A8, 0x338}}, {0x22AE, {0x22A9, 0x338}}, {0x22AF, {0x22AB, 0x338}}, {0x22E0, {0x227C, 0x338}}, {0x22E1, {0x227D, 0x338}}, {0x22E2, {0x2291, 0x338}}, +{0x22E3, {0x2292, 0x338}}, {0x22EA, {0x22B2, 0x338}}, {0x22EB, {0x22B3, 0x338}}, {0x22EC, {0x22B4, 0x338}}, {0x22ED, {0x22B5, 0x338}}, {0x2329, {0x3008}}, {0x232A, {0x3009}}, +{0x2ADC, {0x2ADD, 0x338}}, {0x304C, {0x304B, 0x3099}}, {0x304E, {0x304D, 0x3099}}, {0x3050, {0x304F, 0x3099}}, {0x3052, {0x3051, 0x3099}}, {0x3054, {0x3053, 0x3099}}, {0x3056, {0x3055, 0x3099}}, +{0x3058, {0x3057, 0x3099}}, {0x305A, {0x3059, 0x3099}}, {0x305C, {0x305B, 0x3099}}, {0x305E, {0x305D, 0x3099}}, {0x3060, {0x305F, 0x3099}}, {0x3062, {0x3061, 0x3099}}, {0x3065, {0x3064, 0x3099}}, +{0x3067, {0x3066, 0x3099}}, {0x3069, {0x3068, 0x3099}}, {0x3070, {0x306F, 0x3099}}, {0x3071, {0x306F, 0x309A}}, {0x3073, {0x3072, 0x3099}}, {0x3074, {0x3072, 0x309A}}, {0x3076, {0x3075, 0x3099}}, +{0x3077, {0x3075, 0x309A}}, {0x3079, {0x3078, 0x3099}}, {0x307A, {0x3078, 0x309A}}, {0x307C, {0x307B, 0x3099}}, {0x307D, {0x307B, 0x309A}}, {0x3094, {0x3046, 0x3099}}, {0x309E, {0x309D, 0x3099}}, +{0x30AC, {0x30AB, 0x3099}}, {0x30AE, {0x30AD, 0x3099}}, {0x30B0, {0x30AF, 0x3099}}, {0x30B2, {0x30B1, 0x3099}}, {0x30B4, {0x30B3, 0x3099}}, {0x30B6, {0x30B5, 0x3099}}, {0x30B8, {0x30B7, 0x3099}}, +{0x30BA, {0x30B9, 0x3099}}, {0x30BC, {0x30BB, 0x3099}}, {0x30BE, {0x30BD, 0x3099}}, {0x30C0, {0x30BF, 0x3099}}, {0x30C2, {0x30C1, 0x3099}}, {0x30C5, {0x30C4, 0x3099}}, {0x30C7, {0x30C6, 0x3099}}, +{0x30C9, {0x30C8, 0x3099}}, {0x30D0, {0x30CF, 0x3099}}, {0x30D1, {0x30CF, 0x309A}}, {0x30D3, {0x30D2, 0x3099}}, {0x30D4, {0x30D2, 0x309A}}, {0x30D6, {0x30D5, 0x3099}}, {0x30D7, {0x30D5, 0x309A}}, +{0x30D9, {0x30D8, 0x3099}}, {0x30DA, {0x30D8, 0x309A}}, {0x30DC, {0x30DB, 0x3099}}, {0x30DD, {0x30DB, 0x309A}}, {0x30F4, {0x30A6, 0x3099}}, {0x30F7, {0x30EF, 0x3099}}, {0x30F8, {0x30F0, 0x3099}}, +{0x30F9, {0x30F1, 0x3099}}, {0x30FA, {0x30F2, 0x3099}}, {0x30FE, {0x30FD, 0x3099}}, {0xF900, {0x8C48}}, {0xF901, {0x66F4}}, {0xF902, {0x8ECA}}, {0xF903, {0x8CC8}}, {0xF904, {0x6ED1}}, +{0xF905, {0x4E32}}, {0xF906, {0x53E5}}, {0xF907, {0x9F9C}}, {0xF908, {0x9F9C}}, {0xF909, {0x5951}}, {0xF90A, {0x91D1}}, {0xF90B, {0x5587}}, {0xF90C, {0x5948}}, {0xF90D, {0x61F6}}, {0xF90E, {0x7669}}, +{0xF90F, {0x7F85}}, {0xF910, {0x863F}}, {0xF911, {0x87BA}}, {0xF912, {0x88F8}}, {0xF913, {0x908F}}, {0xF914, {0x6A02}}, {0xF915, {0x6D1B}}, {0xF916, {0x70D9}}, {0xF917, {0x73DE}}, {0xF918, {0x843D}}, +{0xF919, {0x916A}}, {0xF91A, {0x99F1}}, {0xF91B, {0x4E82}}, {0xF91C, {0x5375}}, {0xF91D, {0x6B04}}, {0xF91E, {0x721B}}, {0xF91F, {0x862D}}, {0xF920, {0x9E1E}}, {0xF921, {0x5D50}}, {0xF922, {0x6FEB}}, +{0xF923, {0x85CD}}, {0xF924, {0x8964}}, {0xF925, {0x62C9}}, {0xF926, {0x81D8}}, {0xF927, {0x881F}}, {0xF928, {0x5ECA}}, {0xF929, {0x6717}}, {0xF92A, {0x6D6A}}, {0xF92B, {0x72FC}}, {0xF92C, {0x90CE}}, +{0xF92D, {0x4F86}}, {0xF92E, {0x51B7}}, {0xF92F, {0x52DE}}, {0xF930, {0x64C4}}, {0xF931, {0x6AD3}}, {0xF932, {0x7210}}, {0xF933, {0x76E7}}, {0xF934, {0x8001}}, {0xF935, {0x8606}}, {0xF936, {0x865C}}, +{0xF937, {0x8DEF}}, {0xF938, {0x9732}}, {0xF939, {0x9B6F}}, {0xF93A, {0x9DFA}}, {0xF93B, {0x788C}}, {0xF93C, {0x797F}}, {0xF93D, {0x7DA0}}, {0xF93E, {0x83C9}}, {0xF93F, {0x9304}}, {0xF940, {0x9E7F}}, +{0xF941, {0x8AD6}}, {0xF942, {0x58DF}}, {0xF943, {0x5F04}}, {0xF944, {0x7C60}}, {0xF945, {0x807E}}, {0xF946, {0x7262}}, {0xF947, {0x78CA}}, {0xF948, {0x8CC2}}, {0xF949, {0x96F7}}, {0xF94A, {0x58D8}}, +{0xF94B, {0x5C62}}, {0xF94C, {0x6A13}}, {0xF94D, {0x6DDA}}, {0xF94E, {0x6F0F}}, {0xF94F, {0x7D2F}}, {0xF950, {0x7E37}}, {0xF951, {0x964B}}, {0xF952, {0x52D2}}, {0xF953, {0x808B}}, {0xF954, {0x51DC}}, +{0xF955, {0x51CC}}, {0xF956, {0x7A1C}}, {0xF957, {0x7DBE}}, {0xF958, {0x83F1}}, {0xF959, {0x9675}}, {0xF95A, {0x8B80}}, {0xF95B, {0x62CF}}, {0xF95C, {0x6A02}}, {0xF95D, {0x8AFE}}, {0xF95E, {0x4E39}}, +{0xF95F, {0x5BE7}}, {0xF960, {0x6012}}, {0xF961, {0x7387}}, {0xF962, {0x7570}}, {0xF963, {0x5317}}, {0xF964, {0x78FB}}, {0xF965, {0x4FBF}}, {0xF966, {0x5FA9}}, {0xF967, {0x4E0D}}, {0xF968, {0x6CCC}}, +{0xF969, {0x6578}}, {0xF96A, {0x7D22}}, {0xF96B, {0x53C3}}, {0xF96C, {0x585E}}, {0xF96D, {0x7701}}, {0xF96E, {0x8449}}, {0xF96F, {0x8AAA}}, {0xF970, {0x6BBA}}, {0xF971, {0x8FB0}}, {0xF972, {0x6C88}}, +{0xF973, {0x62FE}}, {0xF974, {0x82E5}}, {0xF975, {0x63A0}}, {0xF976, {0x7565}}, {0xF977, {0x4EAE}}, {0xF978, {0x5169}}, {0xF979, {0x51C9}}, {0xF97A, {0x6881}}, {0xF97B, {0x7CE7}}, {0xF97C, {0x826F}}, +{0xF97D, {0x8AD2}}, {0xF97E, {0x91CF}}, {0xF97F, {0x52F5}}, {0xF980, {0x5442}}, {0xF981, {0x5973}}, {0xF982, {0x5EEC}}, {0xF983, {0x65C5}}, {0xF984, {0x6FFE}}, {0xF985, {0x792A}}, {0xF986, {0x95AD}}, +{0xF987, {0x9A6A}}, {0xF988, {0x9E97}}, {0xF989, {0x9ECE}}, {0xF98A, {0x529B}}, {0xF98B, {0x66C6}}, {0xF98C, {0x6B77}}, {0xF98D, {0x8F62}}, {0xF98E, {0x5E74}}, {0xF98F, {0x6190}}, {0xF990, {0x6200}}, +{0xF991, {0x649A}}, {0xF992, {0x6F23}}, {0xF993, {0x7149}}, {0xF994, {0x7489}}, {0xF995, {0x79CA}}, {0xF996, {0x7DF4}}, {0xF997, {0x806F}}, {0xF998, {0x8F26}}, {0xF999, {0x84EE}}, {0xF99A, {0x9023}}, +{0xF99B, {0x934A}}, {0xF99C, {0x5217}}, {0xF99D, {0x52A3}}, {0xF99E, {0x54BD}}, {0xF99F, {0x70C8}}, {0xF9A0, {0x88C2}}, {0xF9A1, {0x8AAA}}, {0xF9A2, {0x5EC9}}, {0xF9A3, {0x5FF5}}, {0xF9A4, {0x637B}}, +{0xF9A5, {0x6BAE}}, {0xF9A6, {0x7C3E}}, {0xF9A7, {0x7375}}, {0xF9A8, {0x4EE4}}, {0xF9A9, {0x56F9}}, {0xF9AA, {0x5BE7}}, {0xF9AB, {0x5DBA}}, {0xF9AC, {0x601C}}, {0xF9AD, {0x73B2}}, {0xF9AE, {0x7469}}, +{0xF9AF, {0x7F9A}}, {0xF9B0, {0x8046}}, {0xF9B1, {0x9234}}, {0xF9B2, {0x96F6}}, {0xF9B3, {0x9748}}, {0xF9B4, {0x9818}}, {0xF9B5, {0x4F8B}}, {0xF9B6, {0x79AE}}, {0xF9B7, {0x91B4}}, {0xF9B8, {0x96B8}}, +{0xF9B9, {0x60E1}}, {0xF9BA, {0x4E86}}, {0xF9BB, {0x50DA}}, {0xF9BC, {0x5BEE}}, {0xF9BD, {0x5C3F}}, {0xF9BE, {0x6599}}, {0xF9BF, {0x6A02}}, {0xF9C0, {0x71CE}}, {0xF9C1, {0x7642}}, {0xF9C2, {0x84FC}}, +{0xF9C3, {0x907C}}, {0xF9C4, {0x9F8D}}, {0xF9C5, {0x6688}}, {0xF9C6, {0x962E}}, {0xF9C7, {0x5289}}, {0xF9C8, {0x677B}}, {0xF9C9, {0x67F3}}, {0xF9CA, {0x6D41}}, {0xF9CB, {0x6E9C}}, {0xF9CC, {0x7409}}, +{0xF9CD, {0x7559}}, {0xF9CE, {0x786B}}, {0xF9CF, {0x7D10}}, {0xF9D0, {0x985E}}, {0xF9D1, {0x516D}}, {0xF9D2, {0x622E}}, {0xF9D3, {0x9678}}, {0xF9D4, {0x502B}}, {0xF9D5, {0x5D19}}, {0xF9D6, {0x6DEA}}, +{0xF9D7, {0x8F2A}}, {0xF9D8, {0x5F8B}}, {0xF9D9, {0x6144}}, {0xF9DA, {0x6817}}, {0xF9DB, {0x7387}}, {0xF9DC, {0x9686}}, {0xF9DD, {0x5229}}, {0xF9DE, {0x540F}}, {0xF9DF, {0x5C65}}, {0xF9E0, {0x6613}}, +{0xF9E1, {0x674E}}, {0xF9E2, {0x68A8}}, {0xF9E3, {0x6CE5}}, {0xF9E4, {0x7406}}, {0xF9E5, {0x75E2}}, {0xF9E6, {0x7F79}}, {0xF9E7, {0x88CF}}, {0xF9E8, {0x88E1}}, {0xF9E9, {0x91CC}}, {0xF9EA, {0x96E2}}, +{0xF9EB, {0x533F}}, {0xF9EC, {0x6EBA}}, {0xF9ED, {0x541D}}, {0xF9EE, {0x71D0}}, {0xF9EF, {0x7498}}, {0xF9F0, {0x85FA}}, {0xF9F1, {0x96A3}}, {0xF9F2, {0x9C57}}, {0xF9F3, {0x9E9F}}, {0xF9F4, {0x6797}}, +{0xF9F5, {0x6DCB}}, {0xF9F6, {0x81E8}}, {0xF9F7, {0x7ACB}}, {0xF9F8, {0x7B20}}, {0xF9F9, {0x7C92}}, {0xF9FA, {0x72C0}}, {0xF9FB, {0x7099}}, {0xF9FC, {0x8B58}}, {0xF9FD, {0x4EC0}}, {0xF9FE, {0x8336}}, +{0xF9FF, {0x523A}}, {0xFA00, {0x5207}}, {0xFA01, {0x5EA6}}, {0xFA02, {0x62D3}}, {0xFA03, {0x7CD6}}, {0xFA04, {0x5B85}}, {0xFA05, {0x6D1E}}, {0xFA06, {0x66B4}}, {0xFA07, {0x8F3B}}, {0xFA08, {0x884C}}, +{0xFA09, {0x964D}}, {0xFA0A, {0x898B}}, {0xFA0B, {0x5ED3}}, {0xFA0C, {0x5140}}, {0xFA0D, {0x55C0}}, {0xFA10, {0x585A}}, {0xFA12, {0x6674}}, {0xFA15, {0x51DE}}, {0xFA16, {0x732A}}, {0xFA17, {0x76CA}}, +{0xFA18, {0x793C}}, {0xFA19, {0x795E}}, {0xFA1A, {0x7965}}, {0xFA1B, {0x798F}}, {0xFA1C, {0x9756}}, {0xFA1D, {0x7CBE}}, {0xFA1E, {0x7FBD}}, {0xFA20, {0x8612}}, {0xFA22, {0x8AF8}}, {0xFA25, {0x9038}}, +{0xFA26, {0x90FD}}, {0xFA2A, {0x98EF}}, {0xFA2B, {0x98FC}}, {0xFA2C, {0x9928}}, {0xFA2D, {0x9DB4}}, {0xFA2E, {0x90DE}}, {0xFA2F, {0x96B7}}, {0xFA30, {0x4FAE}}, {0xFA31, {0x50E7}}, {0xFA32, {0x514D}}, +{0xFA33, {0x52C9}}, {0xFA34, {0x52E4}}, {0xFA35, {0x5351}}, {0xFA36, {0x559D}}, {0xFA37, {0x5606}}, {0xFA38, {0x5668}}, {0xFA39, {0x5840}}, {0xFA3A, {0x58A8}}, {0xFA3B, {0x5C64}}, {0xFA3C, {0x5C6E}}, +{0xFA3D, {0x6094}}, {0xFA3E, {0x6168}}, {0xFA3F, {0x618E}}, {0xFA40, {0x61F2}}, {0xFA41, {0x654F}}, {0xFA42, {0x65E2}}, {0xFA43, {0x6691}}, {0xFA44, {0x6885}}, {0xFA45, {0x6D77}}, {0xFA46, {0x6E1A}}, +{0xFA47, {0x6F22}}, {0xFA48, {0x716E}}, {0xFA49, {0x722B}}, {0xFA4A, {0x7422}}, {0xFA4B, {0x7891}}, {0xFA4C, {0x793E}}, {0xFA4D, {0x7949}}, {0xFA4E, {0x7948}}, {0xFA4F, {0x7950}}, {0xFA50, {0x7956}}, +{0xFA51, {0x795D}}, {0xFA52, {0x798D}}, {0xFA53, {0x798E}}, {0xFA54, {0x7A40}}, {0xFA55, {0x7A81}}, {0xFA56, {0x7BC0}}, {0xFA57, {0x7DF4}}, {0xFA58, {0x7E09}}, {0xFA59, {0x7E41}}, {0xFA5A, {0x7F72}}, +{0xFA5B, {0x8005}}, {0xFA5C, {0x81ED}}, {0xFA5D, {0x8279}}, {0xFA5E, {0x8279}}, {0xFA5F, {0x8457}}, {0xFA60, {0x8910}}, {0xFA61, {0x8996}}, {0xFA62, {0x8B01}}, {0xFA63, {0x8B39}}, {0xFA64, {0x8CD3}}, +{0xFA65, {0x8D08}}, {0xFA66, {0x8FB6}}, {0xFA67, {0x9038}}, {0xFA68, {0x96E3}}, {0xFA69, {0x97FF}}, {0xFA6A, {0x983B}}, {0xFA6B, {0x6075}}, {0xFA6C, {0x242EE}}, {0xFA6D, {0x8218}}, {0xFA70, {0x4E26}}, +{0xFA71, {0x51B5}}, {0xFA72, {0x5168}}, {0xFA73, {0x4F80}}, {0xFA74, {0x5145}}, {0xFA75, {0x5180}}, {0xFA76, {0x52C7}}, {0xFA77, {0x52FA}}, {0xFA78, {0x559D}}, {0xFA79, {0x5555}}, {0xFA7A, {0x5599}}, +{0xFA7B, {0x55E2}}, {0xFA7C, {0x585A}}, {0xFA7D, {0x58B3}}, {0xFA7E, {0x5944}}, {0xFA7F, {0x5954}}, {0xFA80, {0x5A62}}, {0xFA81, {0x5B28}}, {0xFA82, {0x5ED2}}, {0xFA83, {0x5ED9}}, {0xFA84, {0x5F69}}, +{0xFA85, {0x5FAD}}, {0xFA86, {0x60D8}}, {0xFA87, {0x614E}}, {0xFA88, {0x6108}}, {0xFA89, {0x618E}}, {0xFA8A, {0x6160}}, {0xFA8B, {0x61F2}}, {0xFA8C, {0x6234}}, {0xFA8D, {0x63C4}}, {0xFA8E, {0x641C}}, +{0xFA8F, {0x6452}}, {0xFA90, {0x6556}}, {0xFA91, {0x6674}}, {0xFA92, {0x6717}}, {0xFA93, {0x671B}}, {0xFA94, {0x6756}}, {0xFA95, {0x6B79}}, {0xFA96, {0x6BBA}}, {0xFA97, {0x6D41}}, {0xFA98, {0x6EDB}}, +{0xFA99, {0x6ECB}}, {0xFA9A, {0x6F22}}, {0xFA9B, {0x701E}}, {0xFA9C, {0x716E}}, {0xFA9D, {0x77A7}}, {0xFA9E, {0x7235}}, {0xFA9F, {0x72AF}}, {0xFAA0, {0x732A}}, {0xFAA1, {0x7471}}, {0xFAA2, {0x7506}}, +{0xFAA3, {0x753B}}, {0xFAA4, {0x761D}}, {0xFAA5, {0x761F}}, {0xFAA6, {0x76CA}}, {0xFAA7, {0x76DB}}, {0xFAA8, {0x76F4}}, {0xFAA9, {0x774A}}, {0xFAAA, {0x7740}}, {0xFAAB, {0x78CC}}, {0xFAAC, {0x7AB1}}, +{0xFAAD, {0x7BC0}}, {0xFAAE, {0x7C7B}}, {0xFAAF, {0x7D5B}}, {0xFAB0, {0x7DF4}}, {0xFAB1, {0x7F3E}}, {0xFAB2, {0x8005}}, {0xFAB3, {0x8352}}, {0xFAB4, {0x83EF}}, {0xFAB5, {0x8779}}, {0xFAB6, {0x8941}}, +{0xFAB7, {0x8986}}, {0xFAB8, {0x8996}}, {0xFAB9, {0x8ABF}}, {0xFABA, {0x8AF8}}, {0xFABB, {0x8ACB}}, {0xFABC, {0x8B01}}, {0xFABD, {0x8AFE}}, {0xFABE, {0x8AED}}, {0xFABF, {0x8B39}}, {0xFAC0, {0x8B8A}}, +{0xFAC1, {0x8D08}}, {0xFAC2, {0x8F38}}, {0xFAC3, {0x9072}}, {0xFAC4, {0x9199}}, {0xFAC5, {0x9276}}, {0xFAC6, {0x967C}}, {0xFAC7, {0x96E3}}, {0xFAC8, {0x9756}}, {0xFAC9, {0x97DB}}, {0xFACA, {0x97FF}}, +{0xFACB, {0x980B}}, {0xFACC, {0x983B}}, {0xFACD, {0x9B12}}, {0xFACE, {0x9F9C}}, {0xFACF, {0x2284A}}, {0xFAD0, {0x22844}}, {0xFAD1, {0x233D5}}, {0xFAD2, {0x3B9D}}, {0xFAD3, {0x4018}}, +{0xFAD4, {0x4039}}, {0xFAD5, {0x25249}}, {0xFAD6, {0x25CD0}}, {0xFAD7, {0x27ED3}}, {0xFAD8, {0x9F43}}, {0xFAD9, {0x9F8E}}, {0xFB1D, {0x5D9, 0x5B4}}, {0xFB1F, {0x5F2, 0x5B7}}, {0xFB2A, {0x5E9, 0x5C1}}, +{0xFB2B, {0x5E9, 0x5C2}}, {0xFB2C, {0x5E9, 0x5BC, 0x5C1}}, {0xFB2D, {0x5E9, 0x5BC, 0x5C2}}, {0xFB2E, {0x5D0, 0x5B7}}, {0xFB2F, {0x5D0, 0x5B8}}, {0xFB30, {0x5D0, 0x5BC}}, {0xFB31, {0x5D1, 0x5BC}}, +{0xFB32, {0x5D2, 0x5BC}}, {0xFB33, {0x5D3, 0x5BC}}, {0xFB34, {0x5D4, 0x5BC}}, {0xFB35, {0x5D5, 0x5BC}}, {0xFB36, {0x5D6, 0x5BC}}, {0xFB38, {0x5D8, 0x5BC}}, {0xFB39, {0x5D9, 0x5BC}}, +{0xFB3A, {0x5DA, 0x5BC}}, {0xFB3B, {0x5DB, 0x5BC}}, {0xFB3C, {0x5DC, 0x5BC}}, {0xFB3E, {0x5DE, 0x5BC}}, {0xFB40, {0x5E0, 0x5BC}}, {0xFB41, {0x5E1, 0x5BC}}, {0xFB43, {0x5E3, 0x5BC}}, +{0xFB44, {0x5E4, 0x5BC}}, {0xFB46, {0x5E6, 0x5BC}}, {0xFB47, {0x5E7, 0x5BC}}, {0xFB48, {0x5E8, 0x5BC}}, {0xFB49, {0x5E9, 0x5BC}}, {0xFB4A, {0x5EA, 0x5BC}}, {0xFB4B, {0x5D5, 0x5B9}}, +{0xFB4C, {0x5D1, 0x5BF}}, {0xFB4D, {0x5DB, 0x5BF}}, {0xFB4E, {0x5E4, 0x5BF}}, {0x1109A, {0x11099, 0x110BA}}, {0x1109C, {0x1109B, 0x110BA}}, {0x110AB, {0x110A5, 0x110BA}}, +{0x1112E, {0x11131, 0x11127}}, {0x1112F, {0x11132, 0x11127}}, {0x1134B, {0x11347, 0x1133E}}, {0x1134C, {0x11347, 0x11357}}, {0x114BB, {0x114B9, 0x114BA}}, {0x114BC, {0x114B9, 0x114B0}}, +{0x114BE, {0x114B9, 0x114BD}}, {0x115BA, {0x115B8, 0x115AF}}, {0x115BB, {0x115B9, 0x115AF}}, {0x1D15E, {0x1D157, 0x1D165}}, {0x1D15F, {0x1D158, 0x1D165}}, {0x1D160, {0x1D158, 0x1D165, 0x1D16E}}, +{0x1D161, {0x1D158, 0x1D165, 0x1D16F}}, {0x1D162, {0x1D158, 0x1D165, 0x1D170}}, {0x1D163, {0x1D158, 0x1D165, 0x1D171}}, {0x1D164, {0x1D158, 0x1D165, 0x1D172}}, {0x1D1BB, {0x1D1B9, 0x1D165}}, +{0x1D1BC, {0x1D1BA, 0x1D165}}, {0x1D1BD, {0x1D1B9, 0x1D165, 0x1D16E}}, {0x1D1BE, {0x1D1BA, 0x1D165, 0x1D16E}}, {0x1D1BF, {0x1D1B9, 0x1D165, 0x1D16F}}, {0x1D1C0, {0x1D1BA, 0x1D165, 0x1D16F}}, +{0x2F800, {0x4E3D}}, {0x2F801, {0x4E38}}, {0x2F802, {0x4E41}}, {0x2F803, {0x20122}}, {0x2F804, {0x4F60}}, {0x2F805, {0x4FAE}}, {0x2F806, {0x4FBB}}, {0x2F807, {0x5002}}, {0x2F808, {0x507A}}, +{0x2F809, {0x5099}}, {0x2F80A, {0x50E7}}, {0x2F80B, {0x50CF}}, {0x2F80C, {0x349E}}, {0x2F80D, {0x2063A}}, {0x2F80E, {0x514D}}, {0x2F80F, {0x5154}}, {0x2F810, {0x5164}}, {0x2F811, {0x5177}}, +{0x2F812, {0x2051C}}, {0x2F813, {0x34B9}}, {0x2F814, {0x5167}}, {0x2F815, {0x518D}}, {0x2F816, {0x2054B}}, {0x2F817, {0x5197}}, {0x2F818, {0x51A4}}, {0x2F819, {0x4ECC}}, {0x2F81A, {0x51AC}}, +{0x2F81B, {0x51B5}}, {0x2F81C, {0x291DF}}, {0x2F81D, {0x51F5}}, {0x2F81E, {0x5203}}, {0x2F81F, {0x34DF}}, {0x2F820, {0x523B}}, {0x2F821, {0x5246}}, {0x2F822, {0x5272}}, {0x2F823, {0x5277}}, +{0x2F824, {0x3515}}, {0x2F825, {0x52C7}}, {0x2F826, {0x52C9}}, {0x2F827, {0x52E4}}, {0x2F828, {0x52FA}}, {0x2F829, {0x5305}}, {0x2F82A, {0x5306}}, {0x2F82B, {0x5317}}, {0x2F82C, {0x5349}}, +{0x2F82D, {0x5351}}, {0x2F82E, {0x535A}}, {0x2F82F, {0x5373}}, {0x2F830, {0x537D}}, {0x2F831, {0x537F}}, {0x2F832, {0x537F}}, {0x2F833, {0x537F}}, {0x2F834, {0x20A2C}}, {0x2F835, {0x7070}}, +{0x2F836, {0x53CA}}, {0x2F837, {0x53DF}}, {0x2F838, {0x20B63}}, {0x2F839, {0x53EB}}, {0x2F83A, {0x53F1}}, {0x2F83B, {0x5406}}, {0x2F83C, {0x549E}}, {0x2F83D, {0x5438}}, {0x2F83E, {0x5448}}, +{0x2F83F, {0x5468}}, {0x2F840, {0x54A2}}, {0x2F841, {0x54F6}}, {0x2F842, {0x5510}}, {0x2F843, {0x5553}}, {0x2F844, {0x5563}}, {0x2F845, {0x5584}}, {0x2F846, {0x5584}}, {0x2F847, {0x5599}}, +{0x2F848, {0x55AB}}, {0x2F849, {0x55B3}}, {0x2F84A, {0x55C2}}, {0x2F84B, {0x5716}}, {0x2F84C, {0x5606}}, {0x2F84D, {0x5717}}, {0x2F84E, {0x5651}}, {0x2F84F, {0x5674}}, {0x2F850, {0x5207}}, +{0x2F851, {0x58EE}}, {0x2F852, {0x57CE}}, {0x2F853, {0x57F4}}, {0x2F854, {0x580D}}, {0x2F855, {0x578B}}, {0x2F856, {0x5832}}, {0x2F857, {0x5831}}, {0x2F858, {0x58AC}}, {0x2F859, {0x214E4}}, +{0x2F85A, {0x58F2}}, {0x2F85B, {0x58F7}}, {0x2F85C, {0x5906}}, {0x2F85D, {0x591A}}, {0x2F85E, {0x5922}}, {0x2F85F, {0x5962}}, {0x2F860, {0x216A8}}, {0x2F861, {0x216EA}}, {0x2F862, {0x59EC}}, +{0x2F863, {0x5A1B}}, {0x2F864, {0x5A27}}, {0x2F865, {0x59D8}}, {0x2F866, {0x5A66}}, {0x2F867, {0x36EE}}, {0x2F868, {0x36FC}}, {0x2F869, {0x5B08}}, {0x2F86A, {0x5B3E}}, {0x2F86B, {0x5B3E}}, +{0x2F86C, {0x219C8}}, {0x2F86D, {0x5BC3}}, {0x2F86E, {0x5BD8}}, {0x2F86F, {0x5BE7}}, {0x2F870, {0x5BF3}}, {0x2F871, {0x21B18}}, {0x2F872, {0x5BFF}}, {0x2F873, {0x5C06}}, {0x2F874, {0x5F53}}, +{0x2F875, {0x5C22}}, {0x2F876, {0x3781}}, {0x2F877, {0x5C60}}, {0x2F878, {0x5C6E}}, {0x2F879, {0x5CC0}}, {0x2F87A, {0x5C8D}}, {0x2F87B, {0x21DE4}}, {0x2F87C, {0x5D43}}, {0x2F87D, {0x21DE6}}, +{0x2F87E, {0x5D6E}}, {0x2F87F, {0x5D6B}}, {0x2F880, {0x5D7C}}, {0x2F881, {0x5DE1}}, {0x2F882, {0x5DE2}}, {0x2F883, {0x382F}}, {0x2F884, {0x5DFD}}, {0x2F885, {0x5E28}}, {0x2F886, {0x5E3D}}, +{0x2F887, {0x5E69}}, {0x2F888, {0x3862}}, {0x2F889, {0x22183}}, {0x2F88A, {0x387C}}, {0x2F88B, {0x5EB0}}, {0x2F88C, {0x5EB3}}, {0x2F88D, {0x5EB6}}, {0x2F88E, {0x5ECA}}, {0x2F88F, {0x2A392}}, +{0x2F890, {0x5EFE}}, {0x2F891, {0x22331}}, {0x2F892, {0x22331}}, {0x2F893, {0x8201}}, {0x2F894, {0x5F22}}, {0x2F895, {0x5F22}}, {0x2F896, {0x38C7}}, {0x2F897, {0x232B8}}, {0x2F898, {0x261DA}}, +{0x2F899, {0x5F62}}, {0x2F89A, {0x5F6B}}, {0x2F89B, {0x38E3}}, {0x2F89C, {0x5F9A}}, {0x2F89D, {0x5FCD}}, {0x2F89E, {0x5FD7}}, {0x2F89F, {0x5FF9}}, {0x2F8A0, {0x6081}}, {0x2F8A1, {0x393A}}, +{0x2F8A2, {0x391C}}, {0x2F8A3, {0x6094}}, {0x2F8A4, {0x226D4}}, {0x2F8A5, {0x60C7}}, {0x2F8A6, {0x6148}}, {0x2F8A7, {0x614C}}, {0x2F8A8, {0x614E}}, {0x2F8A9, {0x614C}}, {0x2F8AA, {0x617A}}, +{0x2F8AB, {0x618E}}, {0x2F8AC, {0x61B2}}, {0x2F8AD, {0x61A4}}, {0x2F8AE, {0x61AF}}, {0x2F8AF, {0x61DE}}, {0x2F8B0, {0x61F2}}, {0x2F8B1, {0x61F6}}, {0x2F8B2, {0x6210}}, {0x2F8B3, {0x621B}}, +{0x2F8B4, {0x625D}}, {0x2F8B5, {0x62B1}}, {0x2F8B6, {0x62D4}}, {0x2F8B7, {0x6350}}, {0x2F8B8, {0x22B0C}}, {0x2F8B9, {0x633D}}, {0x2F8BA, {0x62FC}}, {0x2F8BB, {0x6368}}, {0x2F8BC, {0x6383}}, +{0x2F8BD, {0x63E4}}, {0x2F8BE, {0x22BF1}}, {0x2F8BF, {0x6422}}, {0x2F8C0, {0x63C5}}, {0x2F8C1, {0x63A9}}, {0x2F8C2, {0x3A2E}}, {0x2F8C3, {0x6469}}, {0x2F8C4, {0x647E}}, {0x2F8C5, {0x649D}}, +{0x2F8C6, {0x6477}}, {0x2F8C7, {0x3A6C}}, {0x2F8C8, {0x654F}}, {0x2F8C9, {0x656C}}, {0x2F8CA, {0x2300A}}, {0x2F8CB, {0x65E3}}, {0x2F8CC, {0x66F8}}, {0x2F8CD, {0x6649}}, {0x2F8CE, {0x3B19}}, +{0x2F8CF, {0x6691}}, {0x2F8D0, {0x3B08}}, {0x2F8D1, {0x3AE4}}, {0x2F8D2, {0x5192}}, {0x2F8D3, {0x5195}}, {0x2F8D4, {0x6700}}, {0x2F8D5, {0x669C}}, {0x2F8D6, {0x80AD}}, {0x2F8D7, {0x43D9}}, +{0x2F8D8, {0x6717}}, {0x2F8D9, {0x671B}}, {0x2F8DA, {0x6721}}, {0x2F8DB, {0x675E}}, {0x2F8DC, {0x6753}}, {0x2F8DD, {0x233C3}}, {0x2F8DE, {0x3B49}}, {0x2F8DF, {0x67FA}}, {0x2F8E0, {0x6785}}, +{0x2F8E1, {0x6852}}, {0x2F8E2, {0x6885}}, {0x2F8E3, {0x2346D}}, {0x2F8E4, {0x688E}}, {0x2F8E5, {0x681F}}, {0x2F8E6, {0x6914}}, {0x2F8E7, {0x3B9D}}, {0x2F8E8, {0x6942}}, {0x2F8E9, {0x69A3}}, +{0x2F8EA, {0x69EA}}, {0x2F8EB, {0x6AA8}}, {0x2F8EC, {0x236A3}}, {0x2F8ED, {0x6ADB}}, {0x2F8EE, {0x3C18}}, {0x2F8EF, {0x6B21}}, {0x2F8F0, {0x238A7}}, {0x2F8F1, {0x6B54}}, {0x2F8F2, {0x3C4E}}, +{0x2F8F3, {0x6B72}}, {0x2F8F4, {0x6B9F}}, {0x2F8F5, {0x6BBA}}, {0x2F8F6, {0x6BBB}}, {0x2F8F7, {0x23A8D}}, {0x2F8F8, {0x21D0B}}, {0x2F8F9, {0x23AFA}}, {0x2F8FA, {0x6C4E}}, {0x2F8FB, {0x23CBC}}, +{0x2F8FC, {0x6CBF}}, {0x2F8FD, {0x6CCD}}, {0x2F8FE, {0x6C67}}, {0x2F8FF, {0x6D16}}, {0x2F900, {0x6D3E}}, {0x2F901, {0x6D77}}, {0x2F902, {0x6D41}}, {0x2F903, {0x6D69}}, {0x2F904, {0x6D78}}, +{0x2F905, {0x6D85}}, {0x2F906, {0x23D1E}}, {0x2F907, {0x6D34}}, {0x2F908, {0x6E2F}}, {0x2F909, {0x6E6E}}, {0x2F90A, {0x3D33}}, {0x2F90B, {0x6ECB}}, {0x2F90C, {0x6EC7}}, {0x2F90D, {0x23ED1}}, +{0x2F90E, {0x6DF9}}, {0x2F90F, {0x6F6E}}, {0x2F910, {0x23F5E}}, {0x2F911, {0x23F8E}}, {0x2F912, {0x6FC6}}, {0x2F913, {0x7039}}, {0x2F914, {0x701E}}, {0x2F915, {0x701B}}, {0x2F916, {0x3D96}}, +{0x2F917, {0x704A}}, {0x2F918, {0x707D}}, {0x2F919, {0x7077}}, {0x2F91A, {0x70AD}}, {0x2F91B, {0x20525}}, {0x2F91C, {0x7145}}, {0x2F91D, {0x24263}}, {0x2F91E, {0x719C}}, {0x2F91F, {0x243AB}}, +{0x2F920, {0x7228}}, {0x2F921, {0x7235}}, {0x2F922, {0x7250}}, {0x2F923, {0x24608}}, {0x2F924, {0x7280}}, {0x2F925, {0x7295}}, {0x2F926, {0x24735}}, {0x2F927, {0x24814}}, {0x2F928, {0x737A}}, +{0x2F929, {0x738B}}, {0x2F92A, {0x3EAC}}, {0x2F92B, {0x73A5}}, {0x2F92C, {0x3EB8}}, {0x2F92D, {0x3EB8}}, {0x2F92E, {0x7447}}, {0x2F92F, {0x745C}}, {0x2F930, {0x7471}}, {0x2F931, {0x7485}}, +{0x2F932, {0x74CA}}, {0x2F933, {0x3F1B}}, {0x2F934, {0x7524}}, {0x2F935, {0x24C36}}, {0x2F936, {0x753E}}, {0x2F937, {0x24C92}}, {0x2F938, {0x7570}}, {0x2F939, {0x2219F}}, {0x2F93A, {0x7610}}, +{0x2F93B, {0x24FA1}}, {0x2F93C, {0x24FB8}}, {0x2F93D, {0x25044}}, {0x2F93E, {0x3FFC}}, {0x2F93F, {0x4008}}, {0x2F940, {0x76F4}}, {0x2F941, {0x250F3}}, {0x2F942, {0x250F2}}, {0x2F943, {0x25119}}, +{0x2F944, {0x25133}}, {0x2F945, {0x771E}}, {0x2F946, {0x771F}}, {0x2F947, {0x771F}}, {0x2F948, {0x774A}}, {0x2F949, {0x4039}}, {0x2F94A, {0x778B}}, {0x2F94B, {0x4046}}, {0x2F94C, {0x4096}}, +{0x2F94D, {0x2541D}}, {0x2F94E, {0x784E}}, {0x2F94F, {0x788C}}, {0x2F950, {0x78CC}}, {0x2F951, {0x40E3}}, {0x2F952, {0x25626}}, {0x2F953, {0x7956}}, {0x2F954, {0x2569A}}, {0x2F955, {0x256C5}}, +{0x2F956, {0x798F}}, {0x2F957, {0x79EB}}, {0x2F958, {0x412F}}, {0x2F959, {0x7A40}}, {0x2F95A, {0x7A4A}}, {0x2F95B, {0x7A4F}}, {0x2F95C, {0x2597C}}, {0x2F95D, {0x25AA7}}, {0x2F95E, {0x25AA7}}, +{0x2F95F, {0x7AEE}}, {0x2F960, {0x4202}}, {0x2F961, {0x25BAB}}, {0x2F962, {0x7BC6}}, {0x2F963, {0x7BC9}}, {0x2F964, {0x4227}}, {0x2F965, {0x25C80}}, {0x2F966, {0x7CD2}}, {0x2F967, {0x42A0}}, +{0x2F968, {0x7CE8}}, {0x2F969, {0x7CE3}}, {0x2F96A, {0x7D00}}, {0x2F96B, {0x25F86}}, {0x2F96C, {0x7D63}}, {0x2F96D, {0x4301}}, {0x2F96E, {0x7DC7}}, {0x2F96F, {0x7E02}}, {0x2F970, {0x7E45}}, +{0x2F971, {0x4334}}, {0x2F972, {0x26228}}, {0x2F973, {0x26247}}, {0x2F974, {0x4359}}, {0x2F975, {0x262D9}}, {0x2F976, {0x7F7A}}, {0x2F977, {0x2633E}}, {0x2F978, {0x7F95}}, {0x2F979, {0x7FFA}}, +{0x2F97A, {0x8005}}, {0x2F97B, {0x264DA}}, {0x2F97C, {0x26523}}, {0x2F97D, {0x8060}}, {0x2F97E, {0x265A8}}, {0x2F97F, {0x8070}}, {0x2F980, {0x2335F}}, {0x2F981, {0x43D5}}, {0x2F982, {0x80B2}}, +{0x2F983, {0x8103}}, {0x2F984, {0x440B}}, {0x2F985, {0x813E}}, {0x2F986, {0x5AB5}}, {0x2F987, {0x267A7}}, {0x2F988, {0x267B5}}, {0x2F989, {0x23393}}, {0x2F98A, {0x2339C}}, {0x2F98B, {0x8201}}, +{0x2F98C, {0x8204}}, {0x2F98D, {0x8F9E}}, {0x2F98E, {0x446B}}, {0x2F98F, {0x8291}}, {0x2F990, {0x828B}}, {0x2F991, {0x829D}}, {0x2F992, {0x52B3}}, {0x2F993, {0x82B1}}, {0x2F994, {0x82B3}}, +{0x2F995, {0x82BD}}, {0x2F996, {0x82E6}}, {0x2F997, {0x26B3C}}, {0x2F998, {0x82E5}}, {0x2F999, {0x831D}}, {0x2F99A, {0x8363}}, {0x2F99B, {0x83AD}}, {0x2F99C, {0x8323}}, {0x2F99D, {0x83BD}}, +{0x2F99E, {0x83E7}}, {0x2F99F, {0x8457}}, {0x2F9A0, {0x8353}}, {0x2F9A1, {0x83CA}}, {0x2F9A2, {0x83CC}}, {0x2F9A3, {0x83DC}}, {0x2F9A4, {0x26C36}}, {0x2F9A5, {0x26D6B}}, {0x2F9A6, {0x26CD5}}, +{0x2F9A7, {0x452B}}, {0x2F9A8, {0x84F1}}, {0x2F9A9, {0x84F3}}, {0x2F9AA, {0x8516}}, {0x2F9AB, {0x273CA}}, {0x2F9AC, {0x8564}}, {0x2F9AD, {0x26F2C}}, {0x2F9AE, {0x455D}}, {0x2F9AF, {0x4561}}, +{0x2F9B0, {0x26FB1}}, {0x2F9B1, {0x270D2}}, {0x2F9B2, {0x456B}}, {0x2F9B3, {0x8650}}, {0x2F9B4, {0x865C}}, {0x2F9B5, {0x8667}}, {0x2F9B6, {0x8669}}, {0x2F9B7, {0x86A9}}, {0x2F9B8, {0x8688}}, +{0x2F9B9, {0x870E}}, {0x2F9BA, {0x86E2}}, {0x2F9BB, {0x8779}}, {0x2F9BC, {0x8728}}, {0x2F9BD, {0x876B}}, {0x2F9BE, {0x8786}}, {0x2F9BF, {0x45D7}}, {0x2F9C0, {0x87E1}}, {0x2F9C1, {0x8801}}, +{0x2F9C2, {0x45F9}}, {0x2F9C3, {0x8860}}, {0x2F9C4, {0x8863}}, {0x2F9C5, {0x27667}}, {0x2F9C6, {0x88D7}}, {0x2F9C7, {0x88DE}}, {0x2F9C8, {0x4635}}, {0x2F9C9, {0x88FA}}, {0x2F9CA, {0x34BB}}, +{0x2F9CB, {0x278AE}}, {0x2F9CC, {0x27966}}, {0x2F9CD, {0x46BE}}, {0x2F9CE, {0x46C7}}, {0x2F9CF, {0x8AA0}}, {0x2F9D0, {0x8AED}}, {0x2F9D1, {0x8B8A}}, {0x2F9D2, {0x8C55}}, {0x2F9D3, {0x27CA8}}, +{0x2F9D4, {0x8CAB}}, {0x2F9D5, {0x8CC1}}, {0x2F9D6, {0x8D1B}}, {0x2F9D7, {0x8D77}}, {0x2F9D8, {0x27F2F}}, {0x2F9D9, {0x20804}}, {0x2F9DA, {0x8DCB}}, {0x2F9DB, {0x8DBC}}, {0x2F9DC, {0x8DF0}}, +{0x2F9DD, {0x208DE}}, {0x2F9DE, {0x8ED4}}, {0x2F9DF, {0x8F38}}, {0x2F9E0, {0x285D2}}, {0x2F9E1, {0x285ED}}, {0x2F9E2, {0x9094}}, {0x2F9E3, {0x90F1}}, {0x2F9E4, {0x9111}}, {0x2F9E5, {0x2872E}}, +{0x2F9E6, {0x911B}}, {0x2F9E7, {0x9238}}, {0x2F9E8, {0x92D7}}, {0x2F9E9, {0x92D8}}, {0x2F9EA, {0x927C}}, {0x2F9EB, {0x93F9}}, {0x2F9EC, {0x9415}}, {0x2F9ED, {0x28BFA}}, {0x2F9EE, {0x958B}}, +{0x2F9EF, {0x4995}}, {0x2F9F0, {0x95B7}}, {0x2F9F1, {0x28D77}}, {0x2F9F2, {0x49E6}}, {0x2F9F3, {0x96C3}}, {0x2F9F4, {0x5DB2}}, {0x2F9F5, {0x9723}}, {0x2F9F6, {0x29145}}, {0x2F9F7, {0x2921A}}, +{0x2F9F8, {0x4A6E}}, {0x2F9F9, {0x4A76}}, {0x2F9FA, {0x97E0}}, {0x2F9FB, {0x2940A}}, {0x2F9FC, {0x4AB2}}, {0x2F9FD, {0x29496}}, {0x2F9FE, {0x980B}}, {0x2F9FF, {0x980B}}, {0x2FA00, {0x9829}}, +{0x2FA01, {0x295B6}}, {0x2FA02, {0x98E2}}, {0x2FA03, {0x4B33}}, {0x2FA04, {0x9929}}, {0x2FA05, {0x99A7}}, {0x2FA06, {0x99C2}}, {0x2FA07, {0x99FE}}, {0x2FA08, {0x4BCE}}, {0x2FA09, {0x29B30}}, +{0x2FA0A, {0x9B12}}, {0x2FA0B, {0x9C40}}, {0x2FA0C, {0x9CFD}}, {0x2FA0D, {0x4CCE}}, {0x2FA0E, {0x4CED}}, {0x2FA0F, {0x9D67}}, {0x2FA10, {0x2A0CE}}, {0x2FA11, {0x4CF8}}, {0x2FA12, {0x2A105}}, +{0x2FA13, {0x2A20E}}, {0x2FA14, {0x2A291}}, {0x2FA15, {0x9EBB}}, {0x2FA16, {0x4D56}}, {0x2FA17, {0x9EF9}}, {0x2FA18, {0x9EFE}}, {0x2FA19, {0x9F05}}, {0x2FA1A, {0x9F0F}}, {0x2FA1B, {0x9F16}}, +{0x2FA1D, {0x2A600}}, +}; + static std::string codepoint_to_utf8(uint32_t cp) { std::string result; if (/* 0x00 <= cp && */ cp <= 0x7f) { From adcb12a9bad87bc96f2f158c95892b3d04aa7ffb Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Wed, 28 Feb 2024 03:52:56 -0500 Subject: [PATCH 05/16] llama : fix non-quantization of expert gating tensors (#5754) This reverts a single line from #5475 --- llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 356ca1076..893bcdbc0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11162,7 +11162,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= !params->only_copy; // do not quantize expert gating tensors - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight"); + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; // do not quantize positional embeddings and token types (BERT) quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); From a693bea1e6762a17b78b6ddf4611e54136941ea2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 28 Feb 2024 09:55:37 +0100 Subject: [PATCH 06/16] server : hit Ctrl+C twice to exit (#5734) * server: twice ctrl+C to exit * std::atomic_flag * sigint: message * sigint: stderr * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel --------- Co-authored-by: Jared Van Bortel --- examples/server/server.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6b3ee531c..080fa9bd5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2772,7 +2772,16 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } std::function shutdown_handler; -inline void signal_handler(int signal) { shutdown_handler(signal); } +std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; +inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } + shutdown_handler(signal); +} int main(int argc, char **argv) { From 5f706718566e3a5147916dc381f3b99de0ffad47 Mon Sep 17 00:00:00 2001 From: "UEXTM.com" <84163508+uextm@users.noreply.github.com> Date: Sat, 24 Feb 2024 11:27:36 -0500 Subject: [PATCH 07/16] Introduce backend GUIDs (ggml/743) * Introduce backend GUIDs Initial proposed implementation of backend GUIDs (Discussed in https://github.com/ggerganov/ggml/pull/741) Hardcoded CPU backend GUID (for now) Change ggml_backend_is_cpu logic to use GUID * Remove redundant functions Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion * Add spaces to match style Co-authored-by: slaren * Fix brace style to match Co-authored-by: slaren * Add void to () in function signature Co-authored-by: slaren * Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid * add guids to all backends ggml-ci --------- Co-authored-by: slaren --- ggml-backend-impl.h | 2 ++ ggml-backend.c | 16 ++++++++++++++-- ggml-backend.h | 2 +- ggml-cuda.cu | 8 +++++++- ggml-kompute.cpp | 8 +++++++- ggml-metal.m | 8 +++++++- ggml-sycl.cpp | 8 +++++++- ggml-vulkan.cpp | 8 +++++++- ggml.c | 4 ++++ ggml.h | 10 ++++++++++ 10 files changed, 66 insertions(+), 8 deletions(-) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f95df47f7..0e5bf0ae1 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -104,6 +104,8 @@ extern "C" { }; struct ggml_backend { + ggml_guid_t guid; + struct ggml_backend_i iface; ggml_backend_context_t context; diff --git a/ggml-backend.c b/ggml-backend.c index 5076d9e5e..c86673b04 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -12,7 +12,6 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) - // backend buffer type const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { @@ -159,6 +158,13 @@ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml // backend +ggml_guid_t ggml_backend_guid(ggml_backend_t backend) { + if (backend == NULL) { + return NULL; + } + return backend->guid; +} + const char * ggml_backend_name(ggml_backend_t backend) { if (backend == NULL) { return "NULL"; @@ -781,6 +787,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .supports_op = */ ggml_backend_cpu_supports_op, }; +static ggml_guid_t ggml_backend_cpu_guid(void) { + static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 }; + return &guid; +} + ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); if (ctx == NULL) { @@ -800,6 +811,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } *cpu_backend = (struct ggml_backend) { + /* .guid = */ ggml_backend_cpu_guid(), /* .interface = */ cpu_backend_i, /* .context = */ ctx }; @@ -807,7 +819,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_cpu_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid()); } void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { diff --git a/ggml-backend.h b/ggml-backend.h index f13c69bff..8fb54bd92 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -49,7 +49,7 @@ extern "C" { // Backend // - + GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); GGML_API const char * ggml_backend_name(ggml_backend_t backend); GGML_API void ggml_backend_free(ggml_backend_t backend); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 831c84efb..0c6501e98 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -12277,6 +12277,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .supports_op = */ ggml_backend_cuda_supports_op, }; +static ggml_guid_t ggml_backend_cuda_guid() { + static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; + return &guid; +} + GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { ggml_init_cublas(); // TODO: remove from ggml.c @@ -12294,6 +12299,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { }; ggml_backend_t cuda_backend = new ggml_backend { + /* .guid = */ ggml_backend_cuda_guid(), /* .interface = */ ggml_backend_cuda_interface, /* .context = */ ctx }; @@ -12302,7 +12308,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { } GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_cuda_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid()); } GGML_CALL int ggml_backend_cuda_get_device_count() { diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 51c5af8ec..e740a76d1 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1953,11 +1953,17 @@ static struct ggml_backend_i kompute_backend_i = { /* .supports_op = */ ggml_backend_kompute_supports_op, }; +static ggml_guid_t ggml_backend_kompute_guid() { + static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca }; + return &guid; +} + ggml_backend_t ggml_backend_kompute_init(int device) { GGML_ASSERT(s_kompute_context == nullptr); s_kompute_context = new ggml_kompute_context(device); ggml_backend_t kompute_backend = new ggml_backend { + /* .guid = */ ggml_backend_kompute_guid(), /* .interface = */ kompute_backend_i, /* .context = */ s_kompute_context, }; @@ -1966,7 +1972,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) { } bool ggml_backend_is_kompute(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_kompute_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); } static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) { diff --git a/ggml-metal.m b/ggml-metal.m index 9eba2f5d2..71fcca560 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2771,6 +2771,11 @@ void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * ggml_metal_log_user_data = user_data; } +static ggml_guid_t ggml_backend_metal_guid(void) { + static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 }; + return &guid; +} + ggml_backend_t ggml_backend_metal_init(void) { struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); @@ -2781,6 +2786,7 @@ ggml_backend_t ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { + /* .guid = */ ggml_backend_metal_guid(), /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2789,7 +2795,7 @@ ggml_backend_t ggml_backend_metal_init(void) { } bool ggml_backend_is_metal(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_metal_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid()); } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 835967fb6..a054ec8b9 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15162,6 +15162,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .supports_op = */ ggml_backend_sycl_supports_op, }; +static ggml_guid_t ggml_backend_sycl_guid() { + static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 }; + return &guid; +} + ggml_backend_t ggml_backend_sycl_init(int device) { ggml_init_sycl(); // TODO: remove from ggml.c @@ -15179,6 +15184,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) { }; ggml_backend_t sycl_backend = new ggml_backend { + /* .guid = */ ggml_backend_sycl_guid(), /* .interface = */ ggml_backend_sycl_interface, /* .context = */ ctx }; @@ -15187,7 +15193,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) { } bool ggml_backend_is_sycl(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_sycl_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid()); } static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 6caafb822..314e3d7a9 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5244,6 +5244,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .supports_op = */ ggml_backend_vk_supports_op, }; +static ggml_guid_t ggml_backend_vk_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { if (vk_instance.initialized[idx]) { return vk_instance.backends[idx]; @@ -5262,6 +5267,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { vk_instance.initialized[idx] = true; ggml_backend_t vk_backend = new ggml_backend { + /* .guid = */ ggml_backend_vk_guid(), /* .interface = */ ggml_backend_vk_interface, /* .context = */ &vk_instance.contexts[ctx->idx], }; @@ -5272,7 +5278,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { } GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_vk_name; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); } GGML_CALL int ggml_backend_vk_get_device_count() { diff --git a/ggml.c b/ggml.c index 4591644ad..68ac6201e 100644 --- a/ggml.c +++ b/ggml.c @@ -355,6 +355,10 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { } } +bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { + return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; +} + // // timing // diff --git a/ggml.h b/ggml.h index 23b768640..0a6d3c051 100644 --- a/ggml.h +++ b/ggml.h @@ -672,6 +672,16 @@ extern "C" { GGML_NUMA_STRATEGY_COUNT }; + // + // GUID + // + + // GUID types + typedef uint8_t ggml_guid[16]; + typedef ggml_guid * ggml_guid_t; + + GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b); + // misc GGML_API void ggml_time_init(void); // call this once at the beginning of the program From 2774b0c97427ee3ad3e2ee121354d078794e89d9 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 25 Feb 2024 20:41:35 +0100 Subject: [PATCH 08/16] add google magika inference example (ggml/748) * add magika inference example * ggml : fix unaligned accesses in custom ops * ggml : fix FP32 GELU for values that exceed the FP16 range * use ggml_pool_1d * add README * Update README.md * pad inputs if the files are too small * cleanup ggml-ci --- ggml.c | 54 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/ggml.c b/ggml.c index 68ac6201e..f29b9f13f 100644 --- a/ggml.c +++ b/ggml.c @@ -1608,9 +1608,15 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + if (x[i] <= -10.0f) { + y[i] = 0.0f; + } else if (x[i] >= 10.0f) { + y[i] = x[i]; + } else { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + } } } #else @@ -5780,11 +5786,13 @@ struct ggml_tensor * ggml_pool_1d( is_node = true; } - const int64_t ne[2] = { + const int64_t ne[4] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], + a->ne[2], + a->ne[3], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { op, k0, s0, p0 }; ggml_set_op_params(result, params, sizeof(params)); @@ -15081,9 +15089,10 @@ static void ggml_compute_forward_map_custom1( return; } - struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + struct ggml_map_custom1_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); - p->fun(dst, a, params->ith, params->nth, p->userdata); + p.fun(dst, a, params->ith, params->nth, p.userdata); } // ggml_compute_forward_map_custom2 @@ -15099,9 +15108,10 @@ static void ggml_compute_forward_map_custom2( return; } - struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + struct ggml_map_custom2_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); - p->fun(dst, a, b, params->ith, params->nth, p->userdata); + p.fun(dst, a, b, params->ith, params->nth, p.userdata); } // ggml_compute_forward_map_custom3 @@ -15118,9 +15128,10 @@ static void ggml_compute_forward_map_custom3( return; } - struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + struct ggml_map_custom3_op_params p; + memcpy(&p, dst->op_params, sizeof(p)); - p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); + p.fun(dst, a, b, c, params->ith, params->nth, p.userdata); } // ggml_compute_forward_cross_entropy_loss @@ -17386,29 +17397,32 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_MAP_CUSTOM1: { - struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; - if (p->n_tasks == GGML_N_TASKS_MAX) { + struct ggml_map_custom1_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { n_tasks = n_threads; } else { - n_tasks = MIN(p->n_tasks, n_threads); + n_tasks = MIN(p.n_tasks, n_threads); } } break; case GGML_OP_MAP_CUSTOM2: { - struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; - if (p->n_tasks == GGML_N_TASKS_MAX) { + struct ggml_map_custom2_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { n_tasks = n_threads; } else { - n_tasks = MIN(p->n_tasks, n_threads); + n_tasks = MIN(p.n_tasks, n_threads); } } break; case GGML_OP_MAP_CUSTOM3: { - struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; - if (p->n_tasks == GGML_N_TASKS_MAX) { + struct ggml_map_custom3_op_params p; + memcpy(&p, node->op_params, sizeof(p)); + if (p.n_tasks == GGML_N_TASKS_MAX) { n_tasks = n_threads; } else { - n_tasks = MIN(p->n_tasks, n_threads); + n_tasks = MIN(p.n_tasks, n_threads); } } break; case GGML_OP_CROSS_ENTROPY_LOSS: From 8c0e8f4e73e275756ad69f9c99b26ead085ca9f0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Feb 2024 11:17:32 +0200 Subject: [PATCH 09/16] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 59de34370..389c0bdfe 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -8cdf783f288a98eddf521b0ab1b4d405be9e18ba +b458250b736a7473f7ff3560d47c93f1644f3290 From 78aacf36344df724cdca9f1e1af849b2d2519cb8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Feb 2024 17:36:53 +0200 Subject: [PATCH 10/16] awq-py : remove (#5768) --- awq-py/README.md | 116 ------------------ awq-py/awq/apply_awq.py | 254 ---------------------------------------- awq-py/requirements.txt | 2 - 3 files changed, 372 deletions(-) delete mode 100644 awq-py/README.md delete mode 100644 awq-py/awq/apply_awq.py delete mode 100644 awq-py/requirements.txt diff --git a/awq-py/README.md b/awq-py/README.md deleted file mode 100644 index 16e68d027..000000000 --- a/awq-py/README.md +++ /dev/null @@ -1,116 +0,0 @@ -# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp -[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)] - -**Supported models:** - -- [X] LLaMA -- [x] LLaMA 2 -- [X] MPT -- [X] Mistral AI v0.1 -- [ ] Bloom -- [ ] Mixtral MoE - -**TODO:** -- [x] Update version work with both MPT and MPT-AWQ model -- [ ] Add OPT model -- [ ] Add Bloom model -- [ ] Add Mixtral MoE -- [ ] Support w3, w2 - - -## Contents - -- [Install](##Install) -- [Convert](##Convert) -- [Quantize](##Quantize) -- [Test](##Test) -- [Benchmark](##Benchmark) -- [Results](##Results) - -## Install -Install requirements -```bash -pip install -r requirements.txt -``` -Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT -```bash -git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache -``` - -## Convert -Example for llama model -```bash -# For llama7b and llama2 models -python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf -# For mistral and mpt models -python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf -``` - -## Quantize -```bash -# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types. -./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0 -``` - -## Test -```bash -# For all models. -./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time" -``` - -## Benchmark -The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. -```bash -# For llama and llama2, and mistral models. -./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw -``` - -## Results -Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison -We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k - -### Llama 7B (Build with OpenBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|-----------:|--------------|-------:|-------:|-------:|-------:| -|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 | -|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | -|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 | -|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | -|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - - -### Llama2 7B (Build with CuBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|------------:|--------------|-------:|-------:|-------:|-------:| -|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 | -|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | -|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 | -|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | -|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - - -### Mistral 7B v0.1 (Build with CuBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|-------------:|--------------|-------:|-------:|-------:|-------:| -|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 | -|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G | -|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 | -|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G | -|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - -### MPT 7B (Build with OpenBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|---------:|--------------|-------:|-------:|-------:|--------:| -|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 | -|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G | -|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873| -|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G | -|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py deleted file mode 100644 index 11132c5d2..000000000 --- a/awq-py/awq/apply_awq.py +++ /dev/null @@ -1,254 +0,0 @@ -""" -Implements the AWQ for llama.cpp use cases. -Original paper: https://arxiv.org/abs/2306.00978 - -This code is based on versions of the AWQ implementation found in the following repositories: -* https://github.com/mit-han-lab/llm-awq -* https://github.com/casper-hansen/AutoAWQ -""" - -import os -import torch -import torch.nn as nn - -from transformers import AutoModelForCausalLM, AutoConfig -from transformers.models.bloom.modeling_bloom import BloomGelu -from transformers.models.llama.modeling_llama import LlamaRMSNorm -from transformers.activations import GELUActivation - - -class ScaledActivation(nn.Module): - """ - ScaledActivation module wraps an existing activation function and applies a - scale factor to its output. - - Args: - module (nn.Module): The activation function to be scaled. - scales (torch.Tensor): A tensor of size (num_features,) containing the initial - scale factors for each feature. - - Returns: - torch.Tensor: The scaled output of the activation function. - """ - - def __init__(self, module, scales): - super().__init__() - self.act = module - self.scales = nn.Parameter(scales.data) - - def forward(self, x): - return self.act(x) / self.scales.view(1, 1, -1).to(x.device) - - -def set_op_by_name(layer, name, new_module): - """ - Set the new module for given module's name. - - Args: - layer (nn.Module): The layer in which to replace the submodule. - name (str): The path to the submodule to be replaced, using dot notation - to access nested modules. - new_module (nn.Module): The new module to replace the existing one. - """ - levels = name.split(".") - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_op_by_name(module, op_name): - """ - Retrieves a submodule within a given layer based on its name. - - Args: - module (nn.Module): The layer containing the submodule to find. - op_name (str): The name of the submodule. - - Returns: - nn.Module: The requested submodule found within the given layer. - - Raises: - ValueError: If the specified submodule cannot be found within the layer. - """ - for name, m in module.named_modules(): - if name == op_name: - return m - raise ValueError(f"Cannot find op {op_name} in module {module}") - - -@torch.no_grad() -def scale_ln_fcs(ln, fcs, scales): - """ - Scales the weights of a LayerNorm and a list of fully-connected layers proportionally. - - Args: - ln (nn.LayerNorm): The LayerNorm module to be scaled. - fcs (List[nn.Linear]): A list of fully-connected layers to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - """ - - if not isinstance(fcs, list): - fcs = [fcs] - - scales = scales.to(ln.weight.device) - - ln.weight.div_(scales) - if hasattr(ln, "bias") and ln.bias is not None: - ln.bias.div_(scales) - - for fc in fcs: - fc.weight.mul_(scales.view(1, -1)) - - for p in ln.parameters(): - assert torch.isnan(p).sum() == 0 - for fc in fcs: - for p in fc.parameters(): - assert torch.isnan(p).sum() == 0 - - -@torch.no_grad() -def scale_fc_fc(fc1, fc2, scales): - """ - Scales the weights of two fully-connected layers in a specific pattern. - - Args: - fc1 (nn.Linear): The first fully-connected layer to be scaled. - fc2 (nn.Linear): The second fully-connected layer to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - """ - assert isinstance(fc1, nn.Linear) - assert isinstance(fc2, nn.Linear) - - scales = scales.to(fc1.weight.device) - - fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) - if fc1.bias is not None: - fc1.bias.div_(scales.view(-1)) - - fc2.weight.mul_(scales.view(1, -1)) - - for p in fc1.parameters(): - assert torch.isnan(p).sum() == 0 - for p in fc2.parameters(): - assert torch.isnan(p).sum() == 0 - - -@torch.no_grad() -def scale_gelu_fc(gelu, fc, scales): - """ - Scales the weight of a GELU activation and a fully-connected layer proportionally. - - Args: - gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled. - fc (nn.Linear): The fully-connected layer to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - - Raises: - TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`. - TypeError: If the `fc` module is not of type `nn.Linear`. - """ - assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) - assert isinstance(fc, nn.Linear) - - fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) - - for p in fc.parameters(): - assert torch.isnan(p).sum() == 0 - - -def apply_scale(module, scales_list, input_feat_dict=None): - """ - Applies different scaling strategies to layers based on their type and hierarchy within a given module. - - Args: - module (nn.Module): The module containing the layers to be scaled. - scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing: - * prev_op_name (str): The name of the preceding operation or module, - relative to which the layers to be scaled are located. - * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation. - * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature. - input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding - input features (optional). - """ - for prev_op_name, layer_names, scales in scales_list: - prev_op = get_op_by_name(module, prev_op_name) - layers = [get_op_by_name(module, name) for name in layer_names] - - prev_op.cuda() - for layer in layers: - layer.cuda() - scales.cuda() - - if isinstance(prev_op, nn.Linear): - assert len(layers) == 1 - scale_fc_fc(prev_op, layers[0], scales) - elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower(): - scale_ln_fcs(prev_op, layers, scales) - elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): - new_module = ScaledActivation(prev_op, scales) - set_op_by_name(module, prev_op_name, new_module) - scale_gelu_fc(prev_op, layers[0], scales) - else: - raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") - - # apply the scaling to input feat if given; prepare it for clipping - if input_feat_dict is not None: - for layer_name in layer_names: - inp = input_feat_dict[layer_name] - inp.div_(scales.view(1, -1).to(inp.device)) - - prev_op.cpu() - for layer in layers: - layer.cpu() - scales.cpu() - - -@torch.no_grad() -def apply_clip(module, clip_list): - """ - Applies element-wise clipping to the weight of a specific layer within a given module. - - Args: - module (nn.Module): The module containing the layer to be clipped. - clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing: - * name (str): The name of the layer to be clipped, relative to the root of the module. - * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight. - """ - for name, max_val in clip_list: - layer = get_op_by_name(module, name) - layer.cuda() - max_val = max_val.to(layer.weight.device) - org_shape = layer.weight.shape - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) - layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) - layer.weight.data = layer.weight.data.reshape(org_shape) - layer.cpu() - - -def add_scale_weights(model_path, scale_path, tmp_path): - """ - Adds pre-computed Activation Weight Quantization (AWQ) results to a model, - including scaling factors and clipping bounds. - - Args: - model_path (str): Path to the pre-trained model to be equipped with AWQ. - scale_path (str): Path to the AWQ scale factors (.pt file). - tmp_path (str): Path to the temporary directory where the equipped model will be saved. - """ - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_path, config=config, trust_remote_code=True - ) - model.eval() - awq_results = torch.load(str(scale_path), map_location="cpu") - apply_scale(model, awq_results["scale"]) - apply_clip(model, awq_results["clip"]) - model.save_pretrained(str(tmp_path)) - os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}") diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt deleted file mode 100644 index 991896116..000000000 --- a/awq-py/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch>=2.1.1 -transformers>=4.32.0 From 08c5ee87e4cceb603ecceac90734fcdade57311b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Feb 2024 18:43:38 +0200 Subject: [PATCH 11/16] llama : remove deprecated API (#5770) ggml-ci --- llama.cpp | 88 +------------------------------------------------------ llama.h | 45 ---------------------------- 2 files changed, 1 insertion(+), 132 deletions(-) diff --git a/llama.cpp b/llama.cpp index 893bcdbc0..30d5eb32d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7894,9 +7894,9 @@ static int llama_decode_internal( const auto n_batch = cparams.n_batch; GGML_ASSERT(n_tokens <= n_batch); + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT const int64_t t_start_us = ggml_time_us(); @@ -10062,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand } } -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { - llama_sample_temp(ctx, candidates_p, temp); -} - void llama_sample_repetition_penalties( struct llama_context * ctx, llama_token_data_array * candidates, @@ -10192,38 +10188,6 @@ void llama_sample_apply_guidance( ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } -void llama_sample_classifier_free_guidance( - struct llama_context * ctx, - llama_token_data_array * candidates, - struct llama_context * guidance_ctx, - float scale) { - GGML_ASSERT(ctx); - int64_t t_start_sample_us; - - t_start_sample_us = ggml_time_us(); - const size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); - - GGML_ASSERT(n_vocab == candidates->size); - GGML_ASSERT(!candidates->sorted); - - std::vector logits_base(n_vocab); - for (size_t i = 0; i < n_vocab; ++i) { - logits_base[i] = candidates->data[i].logit; - } - - float * logits_guidance = llama_get_logits(guidance_ctx); - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale); - t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < n_vocab; ++i) { - candidates->data[i].logit = logits_base[i]; - } - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; -} - llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { GGML_ASSERT(ctx); @@ -11724,15 +11688,6 @@ bool llama_supports_gpu_offload(void) { #endif } -// deprecated: -bool llama_mmap_supported(void) { - return llama_supports_mmap(); -} - -bool llama_mlock_supported(void) { - return llama_supports_mlock(); -} - void llama_backend_init(void) { ggml_time_init(); @@ -12244,15 +12199,6 @@ uint32_t llama_model_quantize( } } -int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { - try { - return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; - } -} - int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { try { return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); @@ -12802,38 +12748,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi return true; } -int llama_eval( - struct llama_context * ctx, - llama_token * tokens, - int32_t n_tokens, - int32_t n_past) { - llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - - const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); - if (ret < 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; -} - -int llama_eval_embd( - struct llama_context * ctx, - float * embd, - int32_t n_tokens, - int32_t n_past) { - llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; - - const int ret = llama_decode_internal(*ctx, batch); - if (ret < 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; -} - void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { ctx->cparams.n_threads = n_threads; ctx->cparams.n_threads_batch = n_threads_batch; diff --git a/llama.h b/llama.h index 16e28e91d..a6823bb2b 100644 --- a/llama.h +++ b/llama.h @@ -364,9 +364,6 @@ extern "C" { LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_gpu_offload(void); - LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead"); - LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead"); - LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @@ -423,14 +420,6 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file( - struct llama_context * ctx, - const char * path_lora, - float scale, - const char * path_base_model, - int32_t n_threads), - "use llama_model_apply_lora_from_file instead"); - LLAMA_API int32_t llama_model_apply_lora_from_file( const struct llama_model * model, const char * path_lora, @@ -606,27 +595,6 @@ extern "C" { // Decoding // - // Run the llama inference to obtain the logits and probabilities for the next token(s). - // tokens + n_tokens is the provided batch of new tokens to process - // n_past is the number of tokens to use from previous eval calls - // Returns 0 on success - // DEPRECATED: use llama_decode() instead - LLAMA_API DEPRECATED(int llama_eval( - struct llama_context * ctx, - llama_token * tokens, - int32_t n_tokens, - int32_t n_past), - "use llama_decode() instead"); - - // Same as llama_eval, but use float matrix input directly. - // DEPRECATED: use llama_decode() instead - LLAMA_API DEPRECATED(int llama_eval_embd( - struct llama_context * ctx, - float * embd, - int32_t n_tokens, - int32_t n_past), - "use llama_decode() instead"); - // Return batch for single sequence of tokens starting at pos_0 // // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it @@ -800,13 +768,6 @@ extern "C" { float * logits_guidance, float scale); - LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance( - struct llama_context * ctx, - llama_token_data_array * candidates, - struct llama_context * guidance_ctx, - float scale), - "use llama_sample_apply_guidance() instead"); - /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax( struct llama_context * ctx, @@ -860,12 +821,6 @@ extern "C" { llama_token_data_array * candidates, float temp); - LLAMA_API DEPRECATED(void llama_sample_temperature( - struct llama_context * ctx, - llama_token_data_array * candidates, - float temp), - "use llama_sample_temp instead"); - /// @details Apply constraints from grammar LLAMA_API void llama_sample_grammar( struct llama_context * ctx, From 317709b2a81dbaf87850202686ec5bb2602a504e Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Wed, 28 Feb 2024 19:33:37 +0000 Subject: [PATCH 12/16] make portability_enumeration_ext apple only (#5757) --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 314e3d7a9..896c290b2 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1106,7 +1106,9 @@ void ggml_vk_instance_init() { const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions); +#ifdef __APPLE__ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); +#endif std::vector layers; @@ -1117,13 +1119,17 @@ void ggml_vk_instance_init() { if (validation_ext) { extensions.push_back("VK_EXT_validation_features"); } +#ifdef __APPLE__ if (portability_enumeration_ext) { extensions.push_back("VK_KHR_portability_enumeration"); } +#endif vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); +#ifdef __APPLE__ if (portability_enumeration_ext) { instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; } +#endif std::vector features_enable; vk::ValidationFeaturesEXT validation_features; From 87c91c07663b707e831c59ec373b5e665ff9d64a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Feb 2024 21:44:21 +0200 Subject: [PATCH 13/16] ci : reduce 3b ppl chunks to 1 to avoid timeout (#5771) ggml-ci --- ci/run.sh | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index f3a29c2e9..35eb3c7aa 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -272,19 +272,19 @@ function gg_run_open_llama_3b_v2 { (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log @@ -343,17 +343,17 @@ function gg_run_open_llama_3b_v2 { python3 ../convert-lora-to-ggml.py ${path_lora} # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log set +e From d5ab29757ebc59a30f03e408294ec20628a6374e Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Thu, 29 Feb 2024 00:17:23 -0800 Subject: [PATCH 14/16] llama : constified `llama_set_state_data`'s `src` (#5774) --- llama.cpp | 6 +++--- llama.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 30d5eb32d..62699ce52 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12545,8 +12545,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { - uint8_t * inp = src; +size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { + const uint8_t * inp = src; // set rng { @@ -12555,7 +12555,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - std::string rng_str((char *)inp, rng_size); inp += rng_size; + std::string rng_str((const char *)inp, rng_size); inp += rng_size; std::istringstream rng_ss(rng_str); rng_ss >> ctx->rng; diff --git a/llama.h b/llama.h index a6823bb2b..4d0ebe37d 100644 --- a/llama.h +++ b/llama.h @@ -575,7 +575,7 @@ extern "C" { // Returns the number of bytes read LLAMA_API size_t llama_set_state_data( struct llama_context * ctx, - uint8_t * src); + const uint8_t * src); // Save/load session file LLAMA_API bool llama_load_session_file( From 052051d8ae4639a1c3c61e7da3237bcc572469d4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 29 Feb 2024 21:42:11 +0100 Subject: [PATCH 15/16] Server: normalize naming (#5779) * server: normalize naming * fix spacing --- examples/server/server.cpp | 370 ++++++++++++++++--------------------- examples/server/utils.hpp | 186 ++++++++++++------- 2 files changed, 277 insertions(+), 279 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 080fa9bd5..bf20e0cf1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,8 +33,7 @@ using json = nlohmann::json; -struct server_params -{ +struct server_params { std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; @@ -49,103 +48,50 @@ struct server_params bool server_verbose = false; bool server_log_json = true; -static size_t common_part(const std::vector &a, const std::vector &b) -{ - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) - { - } - return i; -} - -enum stop_type -{ +enum stop_type { STOP_FULL, STOP_PARTIAL, }; -static bool ends_with(const std::string &str, const std::string &suffix) -{ - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} +// TODO: can become bool if we can't find use of more states +enum slot_state { + IDLE, + PROCESSING, +}; -static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) -{ - if (!text.empty() && !stop.empty()) - { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) - { - if (stop[char_index] == text_last_char) - { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) - { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE, +}; -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) -{ - std::string ret; - for (; begin != end; ++begin) - { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} +struct slot_params { + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) -{ - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) - { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) -{ - json out = json::array(); - for (const auto &prob : probs) - { - json probs_for_token = json::array(); - for (const auto &p : prob.probs) - { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json - { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} + std::vector antiprompt; -struct llama_client_slot -{ + json input_prefix; + json input_suffix; +}; + +struct slot_image { + int32_t id; + + bool request_encode_image = false; + float * image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 * img_data; + + std::string prefix_prompt; // before of this image +}; + +struct server_slot { int id; int task_id = -1; @@ -165,8 +111,8 @@ struct llama_client_slot int32_t i_batch = -1; int32_t n_predict = -1; - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; + int32_t n_prompt_tokens = 0; + int32_t n_prompt_tokens_processed = 0; json prompt; std::string generated_text; @@ -201,8 +147,8 @@ struct llama_client_slot std::vector images; // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; + size_t n_sent_text = 0; // number of sent text character + size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_genereration; @@ -214,7 +160,7 @@ struct llama_client_slot int multitask_id = -1; void reset() { - num_prompt_tokens = 0; + n_prompt_tokens = 0; generated_text = ""; truncated = false; stopped_eos = false; @@ -222,16 +168,15 @@ struct llama_client_slot stopped_limit = false; stopping_word = ""; n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; + n_sent_text = 0; + n_sent_token_probs = 0; infill = false; ga_i = 0; n_past_se = 0; generated_token_probs.clear(); - for (slot_image & img : images) - { + for (slot_image & img : images) { free(img.image_embedding); if (img.img_data) { clip_image_u8_free(img.img_data); @@ -243,19 +188,15 @@ struct llama_client_slot } bool has_budget(gpt_params &global_params) { - if (params.n_predict == -1 && global_params.n_predict == -1) - { + if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } n_remaining = -1; - if (params.n_predict != -1) - { + if (params.n_predict != -1) { n_remaining = params.n_predict - n_decoded; - } - else if (global_params.n_predict != -1) - { + } else if (global_params.n_predict != -1) { n_remaining = global_params.n_predict - n_decoded; } @@ -271,8 +212,7 @@ struct llama_client_slot } void add_token_string(const completion_token_output &token) { - if (command == RELEASE) - { + if (command == RELEASE) { return; } cache_tokens.push_back(token.tok); @@ -290,10 +230,10 @@ struct llama_client_slot json get_formated_timings() { return json { - {"prompt_n", num_prompt_tokens_processed}, + {"prompt_n", n_prompt_tokens_processed}, {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, {"predicted_n", n_decoded}, {"predicted_ms", t_token_generation}, @@ -304,18 +244,18 @@ struct llama_client_slot void print_timings() const { char buffer[512]; - double t_token = t_prompt_processing / num_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + double t_token = t_prompt_processing / n_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, num_prompt_tokens_processed, + t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second); LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"num_prompt_tokens_processed", num_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"n_prompt_tokens_processed", n_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, }); t_token = t_token_generation / n_decoded; @@ -343,7 +283,7 @@ struct llama_client_slot } }; -struct llama_metrics { +struct server_metrics { uint64_t n_prompt_tokens_processed_total = 0; uint64_t n_tokens_predicted_total = 0; @@ -354,18 +294,16 @@ struct llama_metrics { uint64_t t_tokens_generation = 0; - void on_prompt_eval(const llama_client_slot &slot) { - n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; - - n_prompt_tokens_processed += slot.num_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; + void on_prompt_eval(const server_slot &slot) { + n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; + n_prompt_tokens_processed += slot.n_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; } - void on_prediction(const llama_client_slot &slot) { + void on_prediction(const server_slot &slot) { n_tokens_predicted_total += slot.n_decoded; - - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; } void reset_bucket() { @@ -404,13 +342,13 @@ struct llama_server_context std::string name_assistant; // slots / clients - std::vector slots; + std::vector slots; json default_generation_settings_for_props; - llama_server_queue queue_tasks; + llama_server_queue queue_tasks; llama_server_response queue_results; - llama_metrics metrics; + server_metrics metrics; ~llama_server_context() { @@ -487,7 +425,7 @@ struct llama_server_context LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; + server_slot slot; slot.id = i; slot.n_ctx = n_ctx_slot; @@ -579,11 +517,11 @@ struct llama_server_context return prompt_tokens; } - llama_client_slot* get_slot(int id) { + server_slot* get_slot(int id) { int64_t t_last = ggml_time_us(); - llama_client_slot *last_used = nullptr; + server_slot *last_used = nullptr; - for (llama_client_slot & slot : slots) + for (server_slot & slot : slots) { if (slot.id == id && slot.available()) { @@ -600,7 +538,7 @@ struct llama_server_context return last_used; } - bool launch_slot_with_data(llama_client_slot* &slot, json data) { + bool launch_slot_with_data(server_slot* &slot, json data) { slot_params default_params; llama_sampling_params default_sparams; @@ -888,7 +826,7 @@ struct llama_server_context clean_kv_cache = false; } - void update_system_prompt() { + void system_prompt_update() { kv_cache_clear(); system_tokens.clear(); @@ -933,9 +871,9 @@ struct llama_server_context system_need_update = false; } - void notify_system_prompt_changed() { + void system_prompt_notify() { // release all slots - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { slot.release(); } @@ -943,17 +881,17 @@ struct llama_server_context system_need_update = true; } - void process_system_prompt_data(const json &sys_props) { + void system_prompt_process(const json &sys_props) { system_prompt = sys_props.value("prompt", ""); name_user = sys_props.value("anti_prompt", ""); name_assistant = sys_props.value("assistant_name", ""); - notify_system_prompt_changed(); + system_prompt_notify(); } static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, - const stop_type type, llama_client_slot &slot) + const stop_type type, server_slot &slot) { size_t stop_pos = std::string::npos; @@ -975,8 +913,8 @@ struct llama_server_context { if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; + slot.stopped_word = true; + slot.stopping_word = word; slot.has_next_token = false; } stop_pos = pos; @@ -986,7 +924,7 @@ struct llama_server_context return stop_pos; } - bool process_token(completion_token_output &result, llama_client_slot &slot) { + bool process_token(completion_token_output &result, server_slot &slot) { // remember which tokens were sampled - used for repetition penalties during sampling const std::string token_str = llama_token_to_piece(ctx, result.tok); slot.sampled = result.tok; @@ -1032,7 +970,7 @@ struct llama_server_context if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); bool is_stop_full = false; size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); @@ -1042,7 +980,7 @@ struct llama_server_context slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); + pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else { @@ -1055,7 +993,7 @@ struct llama_server_context { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); + slot.n_sent_text += result.text_to_send.size(); // add the token to slot queue and cache } slot.add_token_string(result); @@ -1099,7 +1037,7 @@ struct llama_server_context return slot.has_next_token; // continue } - bool process_images(llama_client_slot &slot) const + bool process_images(server_slot &slot) const { for (slot_image &img : slot.images) { @@ -1132,7 +1070,7 @@ struct llama_server_context queue_results.send(res); } - json get_formated_generation(llama_client_slot &slot) + json get_formated_generation(server_slot &slot) { const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && @@ -1179,7 +1117,7 @@ struct llama_server_context }; } - void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + void send_partial_response(server_slot &slot, completion_token_output tkn) { task_result res; res.id = slot.task_id; @@ -1199,13 +1137,13 @@ struct llama_server_context { std::vector probs_output = {}; const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); - size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); if (probs_pos < probs_stop_pos) { probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.sent_token_probs_index = probs_stop_pos; + slot.n_sent_token_probs = probs_stop_pos; res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } @@ -1218,7 +1156,7 @@ struct llama_server_context queue_results.send(res); } - void send_final_response(llama_client_slot &slot) + void send_final_response(server_slot &slot) { task_result res; res.id = slot.task_id; @@ -1233,7 +1171,7 @@ struct llama_server_context {"stop", true}, {"model", params.model_alias}, {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, + {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, {"prompt", slot.prompt}, {"truncated", slot.truncated}, @@ -1271,7 +1209,7 @@ struct llama_server_context queue_results.send(res); } - void send_embedding(llama_client_slot &slot) + void send_embedding(server_slot &slot) { task_result res; res.id = slot.task_id; @@ -1282,9 +1220,7 @@ struct llama_server_context const int n_embd = llama_n_embd(model); if (!params.embedding) { - LOG_WARNING("embedding disabled", { - {"params.embedding", params.embedding}, - }); + LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}}); res.result_json = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1296,7 +1232,7 @@ struct llama_server_context std::vector embedding(data, data + n_embd); res.result_json = json { - {"embedding", embedding }, + {"embedding", embedding}, }; } queue_results.send(res); @@ -1345,7 +1281,7 @@ struct llama_server_context } // for multiple images processing - bool ingest_images(llama_client_slot &slot, int n_batch) + bool ingest_images(server_slot &slot, int n_batch) { int image_idx = 0; @@ -1384,7 +1320,17 @@ struct llama_server_context } const int n_embd = llama_n_embd(model); - llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; + llama_batch batch_img = { + n_eval, + nullptr, + (img.image_embedding + i * n_embd), + nullptr, + nullptr, + nullptr, + nullptr, + slot.n_past, + 1, 0 + }; if (llama_decode(ctx, batch_img)) { LOG_TEE("%s : failed to eval image\n", __func__); @@ -1454,7 +1400,7 @@ struct llama_server_context switch (task.type) { case TASK_TYPE_COMPLETION: { - llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + server_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { // if no slot is available, we defer this task for processing later @@ -1469,10 +1415,10 @@ struct llama_server_context send_error(task, "system prompt can only be updated when all slots are idle"); break; } - process_system_prompt_data(task.data["system_prompt"]); + system_prompt_process(task.data["system_prompt"]); // reset cache_tokens for all slots - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { slot.cache_tokens.clear(); slot.n_past = 0; @@ -1512,20 +1458,20 @@ struct llama_server_context int n_idle_slots = 0; int n_processing_slots = 0; - for (llama_client_slot &slot: slots) { + for (server_slot &slot: slots) { json slot_data = get_formated_generation(slot); slot_data["id"] = slot.id; slot_data["task_id"] = slot.task_id; slot_data["state"] = slot.state; slot_data["prompt"] = slot.prompt; slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, }; if (slot_data["state"] == IDLE) { n_idle_slots++; @@ -1563,10 +1509,10 @@ struct llama_server_context { "n_tokens_predicted", metrics.n_tokens_predicted}, { "t_tokens_generation", metrics.t_tokens_generation}, - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - { "slots", slots_data }, + { "slots", slots_data }, }; metrics.reset_bucket(); queue_results.send(res); @@ -1597,7 +1543,7 @@ struct llama_server_context if (system_need_update) { LOG_INFO("updating system prompt", {}); - update_system_prompt(); + system_prompt_update(); } llama_batch_clear(batch); @@ -1618,7 +1564,7 @@ struct llama_server_context task.target_id = -1; queue_tasks.post(task); - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { if (slot.ga_n == 1) { @@ -1754,45 +1700,50 @@ struct llama_server_context prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt } - slot.num_prompt_tokens = prompt_tokens.size(); + slot.n_prompt_tokens = prompt_tokens.size(); if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; + slot.params.n_keep = slot.n_prompt_tokens; } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) + if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; - const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + std::vector new_tokens( + prompt_tokens.begin(), + prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert( + new_tokens.end(), + prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, + prompt_tokens.end()); LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, }); slot.truncated = true; prompt_tokens = new_tokens; - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + slot.n_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); - slot.n_past = 0; + slot.n_past = 0; slot.n_past_se = 0; - slot.ga_i = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + slot.ga_i = 0; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens; } else { @@ -1811,7 +1762,7 @@ struct llama_server_context slot.n_past -= 1; } - slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past; if (slot.ga_n != 1) { @@ -1836,13 +1787,13 @@ struct llama_server_context { "slot_id", slot.id }, { "task_id", slot.task_id }, { "n_past", slot.n_past }, - { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed } }); } slot.cache_tokens = prompt_tokens; - if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) + if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. LOG_INFO("we have to evaluate at least 1 token to generate logits", { @@ -1898,8 +1849,8 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { LOG_ERROR("failed processing images", { - "slot_id", slot.id, - "task_id", slot.task_id, + {"slot_id", slot.id}, + {"task_id", slot.task_id}, }); // FIXME @phymbert: to be properly tested // early returning without changing the slot state will block the slot for ever @@ -2049,10 +2000,6 @@ struct llama_server_context LOG_VERBOSE("slots updated", {}); return true; } - - void run_on_all_tasks_finished() { - update_slots(); - } }; static void server_print_usage(const char *argv0, const gpt_params ¶ms, @@ -2561,7 +2508,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::istreambuf_iterator(), std::back_inserter(systm_content) ); - llama.process_system_prompt_data(json::parse(systm_content)); + llama.system_prompt_process(json::parse(systm_content)); } else if (arg == "-ctk" || arg == "--cache-type-k") { params.cache_type_k = argv[++i]; @@ -2692,7 +2639,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, /* llama.cpp completion api semantics */ static json format_partial_response( - llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs + llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector &probs ) { json res = json { @@ -2748,14 +2695,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo }); } -struct token_translator -{ - llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } - std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } -}; - -static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot) { auto & gtps = slot->generated_token_probs; auto translator = token_translator{llama.ctx}; @@ -3526,8 +3466,8 @@ int main(int argc, char **argv) &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); - llama.queue_tasks.on_all_tasks_finished(std::bind( - &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_tasks.on_run_slots(std::bind( + &llama_server_context::update_slots, &llama)); llama.queue_results.on_multitask_update(std::bind( &llama_server_queue::update_multitask, &llama.queue_tasks, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d7abd7cbb..d98541f26 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -37,10 +37,6 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -// -// parallel -// - enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_READY, // Server is ready and model is loaded @@ -78,51 +74,8 @@ struct task_multi { std::vector results{}; }; -// TODO: can become bool if we can't find use of more states -enum slot_state -{ - IDLE, - PROCESSING, -}; - -enum slot_command -{ - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params -{ - bool stream = true; - bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image -{ - int32_t id; - - bool request_encode_image = false; - float * image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8 * img_data; - - std::string prefix_prompt; // before of this image -}; - // completion token output with probabilities -struct completion_token_output -{ +struct completion_token_output { struct token_prob { llama_token tok; @@ -134,8 +87,13 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) -{ +struct token_translator { + llama_context * ctx; + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } + std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } +}; + +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); json log = nlohmann::ordered_json{ @@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line, // template -static T json_value(const json &body, const std::string &key, const T &default_value) -{ +static T json_value(const json &body, const std::string &key, const T &default_value) { // Fallback null to default value return body.contains(key) && !body.at(key).is_null() ? body.value(key, default_value) @@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) { } // Format given chat. If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) -{ +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { size_t alloc_size = 0; // vector holding all allocated string to be passed to llama_chat_apply_template std::vector str(messages.size() * 2); @@ -250,7 +206,7 @@ struct llama_server_queue { // callback functions std::function callback_new_task; std::function callback_finish_multitask; - std::function callback_all_task_finished; + std::function callback_run_slots; // Add a new task to the end of the queue int post(task_server task) { @@ -283,14 +239,14 @@ struct llama_server_queue { callback_new_task = callback; } - // Register function to process a multitask + // Register function to process a multitask when it is finished void on_finish_multitask(std::function callback) { callback_finish_multitask = callback; } - // Register the function to be called when the batch of tasks is finished - void on_all_tasks_finished(std::function callback) { - callback_all_task_finished = callback; + // Register the function to be called when all slots data is ready to be processed + void on_run_slots(std::function callback) { + callback_run_slots = callback; } // Call when the state of one slot is changed @@ -312,7 +268,13 @@ struct llama_server_queue { condition_tasks.notify_all(); } - // Start the main loop. + /** + * Main loop consists of these steps: + * - Wait until a new task arrives + * - Process the task (i.e. maybe copy data into slot) + * - Check if multitask is finished + * - Run all slots + */ void start_loop() { running = true; while (true) { @@ -331,8 +293,8 @@ struct llama_server_queue { LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); callback_new_task(task); } - LOG_VERBOSE("callback_all_task_finished", {}); - // process and update all the multitasks + LOG_VERBOSE("update_multitasks", {}); + // check if we have any finished multitasks auto queue_iterator = queue_multitasks.begin(); while (queue_iterator != queue_multitasks.end()) { @@ -349,8 +311,9 @@ struct llama_server_queue { ++queue_iterator; } } - // all tasks in the current loop is finished - callback_all_task_finished(); + // all tasks in the current loop is processed, slots data is now ready + LOG_VERBOSE("callback_run_slots", {}); + callback_run_slots(); } LOG_VERBOSE("wait for new task", {}); // wait for new task @@ -408,12 +371,14 @@ struct llama_server_response { std::mutex mutex_results; std::condition_variable condition_results; + // add the task_id to the list of tasks waiting for response void add_waiting_task_id(int task_id) { LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } + // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int task_id) { LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); @@ -574,3 +539,96 @@ static std::string gen_chatcmplid() chatcmplid << "chatcmpl-" << random_string(); return chatcmplid.str(); } + +// +// other common utils +// + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// TODO: reuse llama_detokenize +template +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ + std::string ret; + for (; begin != end; ++begin) + { + ret += llama_token_to_piece(ctx, *begin); + } + return ret; +} + +// format incomplete utf-8 multibyte character for output +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json + { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + return out; +} From 38d152160898b0173ffe4dc7df5daadcbd2eceb0 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Fri, 1 Mar 2024 07:36:47 +0000 Subject: [PATCH 16/16] [SYCL] Use batched mul_mat pathway (#5591) * Use batched mul_mat pathway * rm extra line * Explicitly state scaled data type --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- ggml-sycl.cpp | 107 +++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index a054ec8b9..6f391b0c6 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -12726,6 +12726,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -13269,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16, int64_t i03 = i13 / r3; int64_t i02 = i12 / r2; - ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; - ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, - const ggml_tensor *src1, - ggml_tensor *dst) try { +static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_BINARY_OP_LOCALS - GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb); - - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - - GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb); - - const int64_t ne1 = ggml_nelements(src1); - const int64_t ne = ggml_nelements(dst); + const int64_t ne_dst = ggml_nelements(dst); SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; @@ -13312,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; // convert src1 to fp16 - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); - GGML_ASSERT(to_fp16_sycl != nullptr); - - sycl_pool_alloc src1_as_f16(ne1); - to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream); + sycl_pool_alloc src1_f16_alloc; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); + const int64_t ne_src1 = ggml_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + } + sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf + : src1_f16_alloc.get(); sycl_pool_alloc dst_f16; char * dst_t; @@ -13337,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, const void * alpha = &alpha_f16; const void * beta = &beta_f16; - if (dst->op_params[0] == GGML_PREC_DEFAULT) { - dst_t = (char *) dst_f16.alloc(ne); + // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway + // once oneMKL open source supports half, half, float, float: datatypes + dst_t = (char *) dst_f16.alloc(ne_dst); - nbd2 /= sizeof(float) / sizeof(sycl::half); - nbd3 /= sizeof(float) / sizeof(sycl::half); - } else { - dst_t = (char *) dst_ddf; - - cu_compute_type = dpct::library_data_t::real_float; - cu_data_type = dpct::library_data_t::real_float; - - alpha = &alpha_f32; - beta = &beta_f32; - } + nbd2 /= sizeof(float) / sizeof(sycl::half); + nbd3 /= sizeof(float) / sizeof(sycl::half); GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -13386,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *)src0_as_f16, dpct::library_data_t::real_half, - nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half), - (const char *)src1_as_f16.get(), dpct::library_data_t::real_half, - nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta, - (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float), + nb01 / nb00, nb02 / nb00, + (const char *)src1_f16, dpct::library_data_t::real_half, + nb11 / nb10, nb12 / nb10, beta, + (char *)dst_t, cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type))); } else { // use syclGemmBatchedEx @@ -13409,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, {sycl::aspect::fp16}); main_stream->submit([&](sycl::handler &cgh) { - const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get(); - const void **ptrs_src_get_ct3 = ptrs_src.get(); - void **ptrs_dst_get_ct4 = ptrs_dst.get(); - + const void **ptrs_src_get = ptrs_src.get(); + void **ptrs_dst_get = ptrs_dst.get(); + size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2; + size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2; cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_compute_batched_ptrs( - src0_as_f16, src1_as_f16_get_ct1, - dst_t, ptrs_src_get_ct3, - ptrs_dst_get_ct4, ne12, ne13, ne23, - nb02, nb03, nb12, nb13, nbd2, nbd3, r2, - r3, item_ct1); + src0_as_f16, src1_f16, + dst_t, ptrs_src_get, + ptrs_dst_get, ne12, ne13, ne23, + nb02, nb03, nb12_scaled, nb13_scaled, + nbd2, nbd3, r2, r3, item_ct1); }); }); } - /* - DPCT1010:95: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this - code. - */ - SYCL_CHECK(0); - SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **)(ptrs_src.get() + 0 * ne23), - dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), + dpct::library_data_t::real_half, nb01 / nb00, (const void **)(ptrs_src.get() + 1 * ne23), - dpct::library_data_t::real_half, nb11 / sizeof(float), beta, + dpct::library_data_t::real_half, nb11 / nb10, beta, (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type))); } #endif - if (dst->op_params[0] == GGML_PREC_DEFAULT) { - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); - to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream); - } + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); + to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -13491,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // KQV single-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n"); ggml_sycl_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n"); - ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst); + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n"); + ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);