From 2b4ea35e56792064598e922e46d081e02bc96b94 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 24 Oct 2023 16:48:37 +0300 Subject: [PATCH 01/63] cuda : add batched cuBLAS GEMM for faster attention (#3749) * cmake : add helper for faster CUDA builds * batched : add NGL arg * ggml : skip nops in compute_forward * cuda : minor indentation * cuda : batched cuBLAS GEMMs for src0 F16 and src1 F32 (attention ops) * Apply suggestions from code review These changes plus: ```c++ #define cublasGemmBatchedEx hipblasGemmBatchedEx ``` are needed to compile with ROCM. I haven't done performance testing, but it seems to work. I couldn't figure out how to propose a change for lines outside what the pull changed, also this is the first time trying to create a multi-part review so please forgive me if I mess something up. * cuda : add ROCm / hipBLAS cublasGemmBatchedEx define * cuda : add cublasGemmStridedBatchedEx for non-broadcasted cases * cuda : reduce mallocs in cublasGemmBatchedEx branch * cuda : add TODO for calling cublas from kernel + using mem pool --------- Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> --- CMakeLists.txt | 1 + examples/batched/batched.cpp | 11 +- ggml-cuda.cu | 190 +++++++++++++++++++++++++++++++++-- ggml.c | 4 + 4 files changed, 193 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6af42a6c2..202f26049 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -331,6 +331,7 @@ if (LLAMA_CUBLAS) set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics else() set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 75856a81f..22a4265df 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -11,7 +11,7 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); return 1 ; } @@ -21,6 +21,9 @@ int main(int argc, char ** argv) { // total length of the sequences including the prompt int n_len = 32; + // number of layers to offload to the GPU + int n_gpu_layers = 0; + if (argc >= 2) { params.model = argv[1]; } @@ -37,6 +40,10 @@ int main(int argc, char ** argv) { n_len = std::atoi(argv[4]); } + if (argc >= 6) { + n_gpu_layers = std::atoi(argv[5]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -49,7 +56,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = n_gpu_layers; llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 654d3632f..db053e3b8 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -29,6 +29,8 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasSetMathMode(handle, mode) 
CUBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream @@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const half * x = (const half *) vx; - const int row_x = blockDim.y*blockIdx.y + threadIdx.y; - const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; const int channel_x = channel / channel_x_divisor; - const int nrows_y = ncols_x; + const int nrows_y = ncols_x; const int nrows_dst = nrows_x; - const int row_dst = row_x; + const int row_dst = row_x; const int idst = channel*nrows_dst + row_dst; @@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous break; } - const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; - const float xi = __half2float(x[ix]); - const int row_y = col_x; + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; const int iy = channel*nrows_y + row_y; + const float xi = __half2float(x[ix]); + tmp += xi * y[iy]; } @@ -7013,7 +7015,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens } static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ - GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -7023,11 +7026,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t ne12 = src1->ne[2]; - const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t ne12 = src1->ne[2]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -7046,6 +7049,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } +static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream)); + + 
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + half * src0_as_f16 = (half *) src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK( + cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + ne12*ne13, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + // use cublasGemmBatchedEx + // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000 + const int ne23 = ne12*ne13; + + // TODO: avoid this alloc + void ** ptrs = (void **) malloc(3*ne23*sizeof(void *)); + + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3]; + ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2; + ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2; + } + } + + // allocate device memory for pointers + void ** ptrs_as = nullptr; + CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *))); + + // TODO: this does not work for some reason -- not sure why? 
+ //size_t ptrs_s = 0; + //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s); + + // copy pointers to device + CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice)); + + free(ptrs); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // free device memory for pointers + CUDA_CHECK(cudaFree(ptrs_as)); + //ggml_cuda_pool_free(ptrs_as, ptrs_s); + } +#endif + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(src1_as_f16, src1_as); + ggml_cuda_pool_free(dst_f16, dst_as); +} + static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; @@ -7058,10 +7214,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // KQ ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + // KQV ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { diff --git a/ggml.c b/ggml.c index 49f3b7aba..17f0ce487 100644 --- a/ggml.c +++ b/ggml.c @@ -16602,6 +16602,10 @@ static void ggml_compute_forward_cross_entropy_loss_back( static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); + if (tensor->op == GGML_OP_NONE) { + return; + } + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { From abd21fc99f1d35e2081e4c01dc09c71a86bf3c5a Mon Sep 17 00:00:00 2001 From: John Smith <67539080+kingsidelee@users.noreply.github.com> 
Date: Wed, 25 Oct 2023 01:48:45 +0800 Subject: [PATCH 02/63] cmake : add missed dependencies (#3763) --- examples/main-cmake-pkg/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 908131884..cb00edbbb 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -16,6 +16,8 @@ add_library(common OBJECT ${_common_path}/console.cpp ${_common_path}/grammar-parser.h ${_common_path}/grammar-parser.cpp + ${_common_path}/sampling.h + ${_common_path}/sampling.cpp ) # WARNING: because build-info.h is auto-generated, it will only From b2f7e04bd312eaf97eee0523aa09d950d585626b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 24 Oct 2023 21:51:20 +0300 Subject: [PATCH 03/63] sync : ggml (conv ops + cuda MSVC fixes) (#3765) ggml-ci --- ggml-cuda.cu | 10 +- ggml.c | 438 +++++++++++++++++++++++++++++++++++++++++---------- ggml.h | 15 +- 3 files changed, 369 insertions(+), 94 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index db053e3b8..d1e874b6c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5664,10 +5664,10 @@ void ggml_init_cublas() { GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -5677,15 +5677,15 @@ void ggml_init_cublas() { g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); // create cuda streams - for (int64_t is = 0; is < MAX_STREAMS; ++is) { + for (int is = 0; is < MAX_STREAMS; ++is) { CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); } diff --git a/ggml.c b/ggml.c index 17f0ce487..6f66bab05 100644 --- a/ggml.c +++ b/ggml.c @@ -571,7 +571,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -1828,7 +1827,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -4057,16 +4055,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4092,7 +4091,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4143,16 +4142,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { 
"alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4178,7 +4178,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4209,8 +4209,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_1D_STAGE_0 ] = true; p[GGML_OP_CONV_1D_STAGE_1 ] = true; - p[GGML_OP_CONV_2D ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_2D_STAGE_0 ] = true; + p[GGML_OP_CONV_2D_STAGE_1 ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5954,7 +5956,6 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } - // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -6008,7 +6009,6 @@ struct ggml_tensor * ggml_sum( return result; } - // ggml_sum_rows struct ggml_tensor * ggml_sum_rows( @@ -6640,7 +6640,6 @@ struct ggml_tensor * ggml_set_2d_inplace( return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // ggml_cpy static struct ggml_tensor * ggml_cpy_impl( @@ -6720,7 +6719,6 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, @@ -7173,7 +7171,6 @@ struct ggml_tensor * ggml_diag( return result; } - // ggml_diag_mask_inf static struct ggml_tensor * ggml_diag_mask_inf_impl( @@ -7285,7 +7282,6 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } - // ggml_soft_max_back static struct ggml_tensor * ggml_soft_max_back_impl( @@ -7702,7 +7698,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // ggml_conv_2d -struct ggml_tensor * ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct ggml_tensor * ggml_conv_2d_stage_0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7721,17 +7721,21 @@ struct ggml_tensor * ggml_conv_2d( is_node = true; } + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_2D; + result->op = GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7740,8 +7744,61 @@ struct ggml_tensor * ggml_conv_2d( } -// ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct ggml_tensor * ggml_conv_2d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// ggml_conv_2d_sk_p0 struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8180,7 +8237,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( return result; } - struct ggml_tensor * ggml_add_rel_pos( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8625,8 +8681,6 @@ struct ggml_tensor * ggml_map_custom3_inplace( return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9828,7 +9882,6 @@ static void ggml_compute_forward_add1( } } - // ggml_compute_forward_acc static void ggml_compute_forward_acc_f32( @@ -9968,7 +10021,6 @@ static void ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10149,7 +10201,6 @@ static void ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE UNUSED(ggml_vec_div_f32); @@ -10287,7 +10338,6 @@ static void ggml_compute_forward_sqrt( } } - // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12120,7 +12170,6 @@ static void ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12316,7 +12365,6 @@ static void ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12714,7 +12762,6 @@ static void ggml_compute_forward_get_rows_back_f32( } } - static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -13997,6 +14044,7 @@ static void ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, ggml_fp16_t * A, ggml_fp16_t * B, @@ -14298,6 +14346,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst 
since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14370,7 +14421,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14389,6 +14440,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14450,6 +14504,144 @@ static void ggml_compute_forward_conv_transpose_1d( // ggml_compute_forward_conv_2d +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_conv_2d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void ggml_compute_forward_conv_2d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == 
GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14462,16 +14654,40 @@ static void ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14487,24 +14703,27 @@ static void ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + 
ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14521,30 +14740,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14570,6 +14781,48 @@ static void ggml_compute_forward_conv_2d( } } +static void ggml_compute_forward_conv_2d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -14628,6 +14881,8 @@ static void ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -16126,7 +16381,6 @@ static void ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16193,7 +16447,6 @@ static void ggml_compute_forward_map_unary_f32( } } - static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16241,7 +16494,6 @@ static void ggml_compute_forward_map_binary_f32( } } - static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16293,7 +16545,6 @@ static void ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -16568,7 +16819,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_sub_f32(nc, ds0, ds0, 
s1); ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16596,7 +16846,6 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -16808,6 +17057,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17737,11 +17994,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_2D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_1: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -18670,6 +18935,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18680,7 +18946,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (node->src[0]->type == GGML_TYPE_F16 && node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18690,6 +18957,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19878,7 +20153,6 @@ static enum ggml_opt_result ggml_opt_adam( opt->loss_after = fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); diff --git a/ggml.h b/ggml.h index 16aaf169e..08bff5511 100644 --- a/ggml.h +++ b/ggml.h @@ -401,15 +401,16 @@ extern "C" { GGML_OP_ALIBI, GGML_OP_CLAMP, GGML_OP_CONV_1D, - GGML_OP_CONV_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_2D_STAGE_0, // internal + GGML_OP_CONV_2D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -1020,9 +1021,9 @@ extern "C" { struct ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. 
we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, From 1717521cdb976a2219888b0e5cba36e210eee9df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 24 Oct 2023 23:08:20 +0300 Subject: [PATCH 04/63] server : do not block system prompt update (#3767) * server : do not block system prompt update * server : update state machine logic to process system prompts * server : minor --- examples/server/server.cpp | 57 +++++++++++++------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 693f9b773..f52a928c8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -454,7 +454,7 @@ struct llama_client_slot } void release() { - if (state == PROCESSING) + if (state == IDLE || state == PROCESSING) { t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; command = RELEASE; @@ -754,6 +754,7 @@ struct llama_server_context } slot->params.antiprompt.clear(); + const auto &stop = data.find("stop"); if (stop != data.end() && stop->is_array()) { @@ -867,7 +868,7 @@ struct llama_server_context kv_cache_clear(); - for (int32_t i = 0; i < batch.n_tokens; ++i) + for (int i = 0; i < (int) system_tokens.size(); ++i) { llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } @@ -894,16 +895,8 @@ struct llama_server_context { slot.release(); } - wait_all_are_idle(); - all_slots_are_idle = true; - // wait until system prompt load system_need_update = true; - while (system_need_update) - { - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - // system prompt loaded, continue } void process_system_prompt_data(const json &sys_props) { @@ -915,26 +908,6 @@ struct llama_server_context { notify_system_prompt_changed(); } - else - { - system_need_update = true; - } - } - - void wait_all_are_idle() { - bool wait = true; - while (wait) - { - wait = false; - for (auto &slot : slots) - { - if (!slot.available()) - { - wait = true; - break; - } - } - } } static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, @@ -965,7 +938,6 @@ struct llama_server_context slot.has_next_token = false; } stop_pos = pos; - } } @@ -1444,7 +1416,7 @@ struct llama_server_context process_tasks(); // update the system prompt wait until all slots are idle state - if (system_need_update) + if (system_need_update && all_slots_are_idle) { LOG_TEE("updating system prompt\n"); update_system_prompt(); @@ -1498,7 +1470,7 @@ struct llama_server_context for (auto & slot : slots) { // release the slot - if (slot.state == PROCESSING && slot.command == RELEASE) + if (slot.command == RELEASE) { slot.state = IDLE; slot.command = NONE; @@ -1509,7 +1481,7 @@ struct llama_server_context continue; } - if (slot.state == IDLE || slot.command == RELEASE) + if (slot.state == IDLE) { continue; } @@ -1530,6 +1502,17 @@ struct llama_server_context { for (auto & slot : slots) { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + 
continue; + } + // need process the prompt if (slot.state == IDLE && slot.command == LOAD_PROMPT) { @@ -1749,8 +1732,8 @@ struct llama_server_context if (!process_token(result, slot)) { slot.release(); - send_final_response(slot); slot.print_timings(); + send_final_response(slot); } slot.i_batch = -1; @@ -2285,7 +2268,7 @@ int main(int argc, char **argv) if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); - if(!result.error && result.stop) { + if (!result.error && result.stop) { res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); } else @@ -2312,7 +2295,7 @@ int main(int argc, char **argv) { return false; } - if(result.stop) { + if (result.stop) { break; } } else { From ad939626577cd25b462e8026cc543efb71528472 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Tue, 24 Oct 2023 16:10:43 -0400 Subject: [PATCH 05/63] server : add parameter -tb N, --threads-batch N (#3584) (#3768) Co-authored-by: Michael Coppola Co-authored-by: Michael Coppola --- examples/server/server.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f52a928c8..b4c4d0a20 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1749,15 +1749,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("usage: %s [options]\n", argv0); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); @@ -1907,6 +1908,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } + else if (arg == "--threads-batch" || arg == "-tb") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) From cc448774866e6479c750bd7c135cd8f92cedee67 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 25 Oct 2023 10:09:16 +0300 Subject: [PATCH 06/63] log : disable pid in log filenames --- common/log.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/common/log.h b/common/log.h index 70e7e4ca2..d2c864cea 100644 --- a/common/log.h +++ b/common/log.h @@ -97,22 +97,23 @@ #define LOG_TEE_TARGET stderr #endif +// NOTE: currently disabled as it produces too many log files // Utility to obtain "pid" like unique process id and use it when creating log files. -inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} +//inline std::string log_get_pid() +//{ +// static std::string pid; +// if (pid.empty()) +// { +// // std::this_thread::get_id() is the most portable way of obtaining a "process id" +// // it's not the same as "pid" but is unique enough to solve multiple instances +// // trying to write to the same log. +// std::stringstream ss; +// ss << std::this_thread::get_id(); +// pid = ss.str(); +// } +// +// return pid; +//} // Utility function for generating log file names with unique id based on thread id. 
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" @@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + //buf << "."; + //buf << log_get_pid(); buf << "."; buf << log_file_extension; From 6961c4bd0b5176e10ab03b35394f1e9eab761792 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 25 Oct 2023 10:26:27 +0300 Subject: [PATCH 07/63] batched-bench : print params at start --- examples/batched-bench/batched-bench.cpp | 4 ++++ ggml-cuda.cu | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index c552eaa73..43f9c971d 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -154,6 +154,10 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n"); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq); + LOG_TEE("\n"); + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d1e874b6c..ba0cd5a7d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6254,16 +6254,15 @@ inline void ggml_cuda_op_mul_mat_cublas( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { - GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src0_dd_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_dd_i != nullptr); - + GGML_ASSERT(dst_dd_i != nullptr); const int64_t ne00 = src0->ne[0]; - const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; int id; @@ -7223,12 +7222,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - // KQ + // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { - // KQV + // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); From 34b2a5e1ee4fe6295fb4420eb91131d743694c65 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Oct 2023 22:53:37 +0300 Subject: [PATCH 08/63] server : do not release slot on image input (#3798) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/server.cpp b/examples/server/server.cpp index b4c4d0a20..5b7e4139d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1502,7 +1502,7 @@ struct llama_server_context { for (auto & slot : slots) { - const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()); + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); // empty prompt passed -> release the slot and send empty response if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) From 2f9ec7e271220a78fe27c9e6ccbcc0dda31cda0f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 27 Oct 2023 17:01:23 +0300 Subject: [PATCH 09/63] cuda : improve text-generation and batched decoding performance (#3776) * cuda : prints wip * cuda : new cublas gemm branch for multi-batch quantized src0 * cuda : add F32 sgemm branch * cuda : fine-tune >= VOLTA params + use MMQ only for small batches * cuda : remove duplicated cuBLAS GEMM code * cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros * build : add compile option to force use of MMQ kernels --- CMakeLists.txt | 7 +++ Makefile | 3 ++ ggml-cuda.cu | 130 +++++++++++++++++++++++++++++++++++++++++++------ llama.cpp | 2 - llama.h | 2 +- 5 files changed, 125 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 202f26049..d9fc86237 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use CUDA" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) +option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) @@ -305,6 +306,9 @@ if (LLAMA_CUBLAS) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() + if (LLAMA_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -405,6 +409,9 @@ if (LLAMA_HIPBLAS) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) endif() + if (LLAMA_CUDA_FORCE_MMQ) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ) + endif() target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index 80179631f..68069f9ff 100644 --- a/Makefile +++ b/Makefile @@ -397,6 +397,9 @@ endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV +ifdef LLAMA_CUDA_FORCE_MMQ + NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # LLAMA_CUDA_FORCE_MMQ ifdef LLAMA_CUDA_DMMV_X NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) else diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ba0cd5a7d..1ba951f68 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -87,6 +87,24 @@ #define CC_OFFSET_AMD 
1000000 #define CC_RDNA2 (CC_OFFSET_AMD + 1030) +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +//#define GGML_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +// probably other such cases, and not sure what happens on AMD hardware +#if !defined(GGML_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -470,7 +488,6 @@ static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = true; static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default @@ -3554,9 +3571,15 @@ static __device__ __forceinline__ void mul_mat_q( #define MMQ_X_Q4_0_RDNA1 64 #define MMQ_Y_Q4_0_RDNA1 64 #define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else #define MMQ_X_Q4_0_AMPERE 64 #define MMQ_Y_Q4_0_AMPERE 128 #define NWARPS_Q4_0_AMPERE 4 +#endif #define MMQ_X_Q4_0_PASCAL 64 #define MMQ_Y_Q4_0_PASCAL 64 #define NWARPS_Q4_0_PASCAL 8 @@ -3615,9 +3638,15 @@ template static __global__ void #define MMQ_X_Q4_1_RDNA1 64 #define MMQ_Y_Q4_1_RDNA1 64 #define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else #define MMQ_X_Q4_1_AMPERE 64 #define MMQ_Y_Q4_1_AMPERE 128 #define NWARPS_Q4_1_AMPERE 4 +#endif #define MMQ_X_Q4_1_PASCAL 64 #define MMQ_Y_Q4_1_PASCAL 64 #define NWARPS_Q4_1_PASCAL 8 @@ -3678,9 +3707,15 @@ template static __global__ void #define MMQ_X_Q5_0_RDNA1 64 #define MMQ_Y_Q5_0_RDNA1 64 #define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else #define MMQ_X_Q5_0_AMPERE 128 #define MMQ_Y_Q5_0_AMPERE 64 #define NWARPS_Q5_0_AMPERE 4 +#endif #define MMQ_X_Q5_0_PASCAL 64 #define MMQ_Y_Q5_0_PASCAL 64 #define NWARPS_Q5_0_PASCAL 8 @@ -3739,9 +3774,15 @@ template static __global__ void #define MMQ_X_Q5_1_RDNA1 64 #define MMQ_Y_Q5_1_RDNA1 64 #define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else #define MMQ_X_Q5_1_AMPERE 128 #define MMQ_Y_Q5_1_AMPERE 64 #define NWARPS_Q5_1_AMPERE 4 +#endif #define MMQ_X_Q5_1_PASCAL 64 #define MMQ_Y_Q5_1_PASCAL 64 #define NWARPS_Q5_1_PASCAL 8 @@ -3800,9 +3841,15 @@ mul_mat_q5_1( #define MMQ_X_Q8_0_RDNA1 64 #define MMQ_Y_Q8_0_RDNA1 64 #define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else #define MMQ_X_Q8_0_AMPERE 128 #define MMQ_Y_Q8_0_AMPERE 64 #define NWARPS_Q8_0_AMPERE 4 +#endif #define MMQ_X_Q8_0_PASCAL 64 #define MMQ_Y_Q8_0_PASCAL 64 #define NWARPS_Q8_0_PASCAL 8 @@ 
-3861,9 +3908,15 @@ template static __global__ void #define MMQ_X_Q2_K_RDNA1 128 #define MMQ_Y_Q2_K_RDNA1 32 #define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else #define MMQ_X_Q2_K_AMPERE 64 #define MMQ_Y_Q2_K_AMPERE 128 #define NWARPS_Q2_K_AMPERE 4 +#endif #define MMQ_X_Q2_K_PASCAL 64 #define MMQ_Y_Q2_K_PASCAL 64 #define NWARPS_Q2_K_PASCAL 8 @@ -3922,9 +3975,15 @@ mul_mat_q2_K( #define MMQ_X_Q3_K_RDNA1 32 #define MMQ_Y_Q3_K_RDNA1 128 #define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else #define MMQ_X_Q3_K_AMPERE 128 #define MMQ_Y_Q3_K_AMPERE 128 #define NWARPS_Q3_K_AMPERE 4 +#endif #define MMQ_X_Q3_K_PASCAL 64 #define MMQ_Y_Q3_K_PASCAL 64 #define NWARPS_Q3_K_PASCAL 8 @@ -3985,9 +4044,15 @@ template static __global__ void #define MMQ_X_Q4_K_RDNA1 32 #define MMQ_Y_Q4_K_RDNA1 64 #define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else #define MMQ_X_Q4_K_AMPERE 64 #define MMQ_Y_Q4_K_AMPERE 128 #define NWARPS_Q4_K_AMPERE 4 +#endif #define MMQ_X_Q4_K_PASCAL 64 #define MMQ_Y_Q4_K_PASCAL 64 #define NWARPS_Q4_K_PASCAL 8 @@ -4048,9 +4113,15 @@ template static __global__ void #define MMQ_X_Q5_K_RDNA1 32 #define MMQ_Y_Q5_K_RDNA1 64 #define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else #define MMQ_X_Q5_K_AMPERE 64 #define MMQ_Y_Q5_K_AMPERE 128 #define NWARPS_Q5_K_AMPERE 4 +#endif #define MMQ_X_Q5_K_PASCAL 64 #define MMQ_Y_Q5_K_PASCAL 64 #define NWARPS_Q5_K_PASCAL 8 @@ -4109,9 +4180,15 @@ mul_mat_q5_K( #define MMQ_X_Q6_K_RDNA1 32 #define MMQ_Y_Q6_K_RDNA1 64 #define NWARPS_Q6_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else #define MMQ_X_Q6_K_AMPERE 64 #define MMQ_Y_Q6_K_AMPERE 64 #define NWARPS_Q6_K_AMPERE 4 +#endif #define MMQ_X_Q6_K_PASCAL 64 #define MMQ_Y_Q6_K_PASCAL 64 #define NWARPS_Q6_K_PASCAL 8 @@ -5663,6 +5740,16 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; +#if defined(GGML_CUDA_FORCE_MMQ) + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); +#endif +#if defined(CUDA_USE_TENSOR_CORES) + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); +#else + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); +#endif fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; @@ -6347,7 +6434,7 @@ inline void ggml_cuda_op_mul_mat_cublas( cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, + src1_ddf_i, ne10, &beta, dst_dd_i, ldc)); if (src0_as != 0) { @@ -7048,9 +7135,10 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } -static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void 
ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -7202,17 +7290,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + const bool all_on_device = + (src0->backend == GGML_BACKEND_GPU) && + (src1->backend == GGML_BACKEND_GPU) && + ( dst->backend == GGML_BACKEND_GPU); int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_compute_capabilities[id] - && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { min_compute_capability = g_compute_capabilities[id]; } } +#ifdef CUDA_USE_TENSOR_CORES + const bool use_tensor_cores = true; +#else + const bool use_tensor_cores = false; +#endif + // debug helpers //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); @@ -7221,20 +7316,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { - #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else @@ -7247,7 +7341,15 @@ static void ggml_cuda_mul_mat(const 
ggml_tensor * src0, const ggml_tensor * src1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { - if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { + use_mul_mat_q = false; + } + + if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); @@ -7601,10 +7703,6 @@ void ggml_cuda_set_main_device(const int main_device) { } } -void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { - g_mul_mat_q = mul_mat_q; -} - void ggml_cuda_set_scratch_size(const size_t scratch_size) { // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously // it still won't always work as expected, but it's better than nothing diff --git a/llama.cpp b/llama.cpp index 61f30c398..cc8669b0e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5959,8 +5959,6 @@ static int llama_decode_internal( } } - ggml_cuda_set_mul_mat_q(cparams.mul_mat_q); - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed if (!lctx.embedding.empty()) { embeddings->backend = GGML_BACKEND_CPU; diff --git a/llama.h b/llama.h index 2f2fee0e2..beac9a0ce 100644 --- a/llama.h +++ b/llama.h @@ -178,7 +178,7 @@ extern "C" { float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only From c8d6a1f34ab6f1b6bd468d256e535a61f98f114c Mon Sep 17 00:00:00 2001 From: Thibault Terrasson Date: Fri, 27 Oct 2023 16:37:41 +0200 Subject: [PATCH 10/63] simple : fix batch handling (#3803) --- examples/simple/simple.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index f376c0509..374aef6f1 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -95,13 +95,8 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(512, 0, 1); // evaluate the initial prompt - batch.n_tokens = tokens_list.size(); - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = tokens_list[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (size_t i = 0; i < tokens_list.size(); i++) { + llama_batch_add(batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -148,15 +143,10 @@ int main(int argc, char ** argv) { fflush(stdout); // prepare the next batch - batch.n_tokens = 0; + llama_batch_clear(batch); // push this new token for next evaluation - batch.token [batch.n_tokens] = new_token_id; - batch.pos [batch.n_tokens] = n_cur; - batch.seq_id[batch.n_tokens] = 0; - batch.logits[batch.n_tokens] = true; - - batch.n_tokens += 1; + llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); n_decode += 1; } From 6d459cbfbe5a011dfca94f9550527a504b6f9aa1 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Fri, 27 Oct 2023 17:33:53 -0400 Subject: [PATCH 11/63] llama : correctly report GGUFv3 format (#3818) --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cc8669b0e..408533d8a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1578,12 +1578,14 @@ static void llama_kv_cache_seq_shift( enum llama_fver { GGUF_FILE_VERSION_V1 = 1, GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2"; + case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; } return "unknown"; From 41aee4df821854f37d90a45281f03b6db8d27de2 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:40:07 -0600 Subject: [PATCH 12/63] speculative : ensure draft and target model vocab matches (#3812) * speculative: Ensure draft and target model vocab matches * Tolerate small differences when checking dft vs tgt vocab --- examples/speculative/speculative.cpp | 33 +++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 92ad27e8e..f921b7845 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -8,6 +8,9 @@ #include #include +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + struct seq_draft { bool active = false; bool drafting = false; @@ -64,6 +67,33 @@ 
int main(int argc, char ** argv) { params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + const int vocab_diff = n_vocab_tgt > n_vocab_dft + ? n_vocab_tgt - n_vocab_dft + : n_vocab_dft - n_vocab_tgt; + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); + fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return 1; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); + fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, + llama_token_to_piece(ctx_tgt, i).c_str(), + llama_token_to_piece(ctx_dft, i).c_str()); + return 1; + } + } + } + // tokenize the prompt std::vector inp; inp = ::llama_tokenize(ctx_tgt, params.prompt, true); @@ -227,6 +257,7 @@ int main(int argc, char ** argv) { llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode (ctx_dft, batch_dft); ++n_past_dft; @@ -370,7 +401,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); + // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); ++n_past_tgt; } From fdee152e4eebb78c191df0b074857111d7f2aba7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 12:06:08 +0300 Subject: [PATCH 13/63] starcoder : add GPU offloading (#3827) * starcoder : do not GPU split 1D bias tensors * starcoder : offload layers to GPU ggml-ci --- llama.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 21 deletions(-) diff --git a/llama.cpp b/llama.cpp index 408533d8a..6caa58960 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2695,8 +2695,8 @@ static void llm_load_tensors( } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2747,19 +2747,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); 
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -4616,6 +4616,8 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; + const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; @@ -4660,6 +4662,27 @@ static struct ggml_cgraph * llm_build_starcoder( } } + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + { // Compute position embeddings. 
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -4685,6 +4708,7 @@ static struct ggml_cgraph * llm_build_starcoder( // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); + offload_func_kq(KQ_mask); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -4708,44 +4732,67 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func(cur); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + offload_func(cur); } { // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + offload_func_kq(cur); - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + offload_func_kq(cur); - struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + ggml_set_name(tmpq, "tmpq"); + ggml_set_name(tmpk, "tmpk"); + ggml_set_name(tmpv, "tmpv"); + + offload_func_kq(tmpq); + offload_func_kq(tmpk); + offload_func_v (tmpv); + + struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); + offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + offload_func_v(v); + ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -4754,23 +4801,28 @@ static struct ggml_cgraph * llm_build_starcoder( 
ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -4783,22 +4835,25 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); ggml_set_name(KQV, "KQV"); - // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); } // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + offload_func(cur); // Add the input cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); struct ggml_tensor * inpFF = cur; @@ -4807,27 +4862,36 @@ static struct ggml_cgraph * llm_build_starcoder( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); + offload_func_nr(cur); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + offload_func_nr(cur); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + offload_func(cur); // GELU activation cur = ggml_gelu(ctx0, cur); + offload_func(cur); // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + offload_func(cur); } inpL = ggml_add(ctx0, cur, inpFF); + } // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func_nr(cur); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + ggml_set_name(cur, "result_norm"); } - ggml_set_name(cur, "result_norm"); cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); From 177461104b454163473dced2a5038f4e016cdb7e Mon Sep 17 00:00:00 2001 From: Henk Poley Date: Sat, 28 Oct 2023 12:16:33 +0200 Subject: [PATCH 14/63] common : print that one line of the syntax help *also* to standard output (#3823) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 44bb76618..c0d4924e2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -743,7 +743,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // GGML_USE_CUBLAS #endif printf(" --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); + printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter 
(implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); From ee1a0ec9cb367ba41d138134795cbbbe93d2bf1c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 14:23:11 +0300 Subject: [PATCH 15/63] llama : add option for greedy sampling with probs (#3813) * llama : add option for greedy sampling with probs * llama : add comment about llama_sample_token_greedy() missing probs * sampling : temp == 0.0 -> no probs, temp < 0.0 -> probs --- common/common.cpp | 1 + common/sampling.cpp | 8 ++++++-- examples/speculative/speculative.cpp | 2 +- llama.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c0d4924e2..f81f4d354 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -224,6 +224,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); } else if (arg == "--tfs") { if (++i >= argc) { invalid_param = true; diff --git a/common/sampling.cpp b/common/sampling.cpp index 5258d4e82..c4996c985 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -167,8 +167,12 @@ llama_token llama_sampling_sample( llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - if (temp <= 0) { - // greedy sampling + if (temp < 0.0) { + // greedy sampling, with probs + llama_sample_softmax(ctx_main, &cur_p); + id = cur_p.data[0].id; + } else if (temp == 0.0) { + // greedy sampling, no probs id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f921b7845..323c74652 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -148,7 +148,7 @@ int main(int argc, char ** argv) { std::vector drafts(n_seq_dft); params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sparams.temp = std::max(0.01f, params.sparams.temp); + params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params.sparams); diff --git a/llama.h b/llama.h index beac9a0ce..d901dcd91 100644 --- a/llama.h +++ b/llama.h @@ -658,6 +658,7 @@ extern "C" { float * mu); /// @details Selects the token with the highest probability. + /// Does not compute the token probabilities. Use llama_sample_softmax() instead. 
LLAMA_API llama_token llama_sample_token_greedy( struct llama_context * ctx, llama_token_data_array * candidates); From bd6d9e205982b34e0ba2c3b22bbf31a1ef1a1bb5 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 28 Oct 2023 05:54:24 -0600 Subject: [PATCH 16/63] llama : allow quantizing k-quants to fall back when tensor size incompatible (#3747) * Allow quantizing k-quants to fall back when tensor size incompatible * quantizing: Add warning when tensors were incompatible with k-quants Clean up k-quants state passing a bit --- llama.cpp | 108 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 43 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6caa58960..3d431ee7b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8049,6 +8049,24 @@ struct no_init { no_init() { /* do nothing */ } }; +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; +#ifdef GGML_USE_K_QUANTS + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; +#endif + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -8109,12 +8127,13 @@ static void llama_convert_tensor_internal( #ifdef GGML_USE_K_QUANTS static ggml_type get_k_quant_type( - ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, - int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 + quantize_state_internal & qs, + ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype ) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const auto tn = LLM_TN(model.arch); + const llm_arch arch = qs.model.arch; + const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; @@ -8122,7 +8141,7 @@ static ggml_type get_k_quant_type( if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } else if (new_type != GGML_TYPE_Q8_0) { @@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type( } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - if (model.type == MODEL_70B) { + (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; } - ++*i_attention_wv; + ++qs.i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K - : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (model.arch == LLM_ARCH_FALCON) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : - use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + if (arch == LLM_ARCH_FALCON) { + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { new_type = GGML_TYPE_Q5_K; } - ++*i_feed_forward_w2; + ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (model.arch != LLM_ARCH_FALCON) { + if (arch != LLM_ARCH_FALCON) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; @@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type( int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; } } if (convert_incompatible_tensor) { - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); + switch (new_type) { + case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; } return new_type; @@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(ml, model); llm_load_hparams(ml, model); + struct quantize_state_internal qs(model, params); + if (params->only_copy) { ftype = model.ftype; } @@ -8281,9 +8305,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_set_val_u32(ctx_out, "general.file_type", ftype); #ifdef GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * meta = ml.get_tensor_meta(i); @@ -8291,19 +8312,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { - ++n_attention_wv; + ++qs.n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - ++n_feed_forward_w2; + ++qs.n_feed_forward_w2; } } - if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); } - - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; #endif size_t total_size_org = 0; @@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS - new_type = get_k_quant_type( - new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 - ); + new_type = get_k_quant_type(qs, new_type, tensor, ftype); #endif // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("\n"); } } +#ifdef GGML_USE_K_QUANTS + if (qs.n_fallback > 0) { + LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", + __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + } +#endif } static int llama_apply_lora_from_file_internal( From 8a2f2fea2914aaa3f4b2f82800c7de15f15bdb09 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 15:25:15 +0300 Subject: [PATCH 17/63] convert : ignore tokens if their IDs are within [0, vocab_size) (#3831) --- convert.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/convert.py b/convert.py index 0680f71ea..bfbfab283 100755 --- a/convert.py +++ b/convert.py @@ -366,16 +366,19 @@ class SentencePieceVocab: added_tokens = {} vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}") - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} + expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) + actual_new_ids = sorted(new_tokens.keys()) + + if expected_new_ids != actual_new_ids: + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") + + # Token pieces that were added to the base vocabulary. + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: From ba231e8a6dd8ad82acfe0e4d492ff7cef6b3f0a1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 15:25:33 +0300 Subject: [PATCH 18/63] issues : change label from bug to bug-unconfirmed (#3748) --- .github/ISSUE_TEMPLATE/bug.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md index d7879b232..c003fe7c1 100644 --- a/.github/ISSUE_TEMPLATE/bug.md +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -1,7 +1,7 @@ --- name: Bug template about: Used to report bugs in llama.cpp -labels: ["bug"] +labels: ["bug-unconfirmed"] assignees: '' --- From 82a6646e0221216c41edcdf99f5a44bb051391f5 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sat, 28 Oct 2023 15:43:01 +0300 Subject: [PATCH 19/63] metal : try cwd for ggml-metal.metal if bundle lookup fails (#3793) * Try cwd for ggml-metal if bundle lookup fails When building with `-DBUILD_SHARED_LIBS=ON -DLLAMA_METAL=ON -DLLAMA_BUILD_SERVER=ON`, `server` would fail to load `ggml-metal.metal` because `[bundle pathForResource:...]` returns `nil`. In that case, fall back to `ggml-metal.metal` in the cwd instead of passing `null` as a path. 
Follows up on #1782 * Update ggml-metal.m --------- Co-authored-by: Georgi Gerganov --- ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index c1901dca7..2380c4310 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + if (sourcePath == nil) { + GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + sourcePath = @"ggml-metal.metal"; + } GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; if (error) { From ff3bad83e29e3009010cbc923bebd769055eaa7f Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Sat, 28 Oct 2023 16:41:07 +0200 Subject: [PATCH 20/63] flake : update flake.lock for newer transformers version + provide extra dev shell (#3797) * flake : update flake.lock for newer transformers version + provide extra dev shell with torch and transformers (for most convert-xxx.py scripts) --- flake.lock | 6 +++--- flake.nix | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index a7777d05d..070f0e161 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1692913444, - "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=", + "lastModified": 1698134075, + "narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "18324978d632ffc55ef1d928e81630c620f4f447", + "rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index cfc4776a4..fa34394b2 100644 --- a/flake.nix +++ b/flake.nix @@ -51,6 +51,9 @@ }; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = + pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]); postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" @@ -126,5 +129,9 @@ buildInputs = [ llama-python ]; packages = nativeBuildInputs ++ osSpecific; }; + devShells.extra = pkgs.mkShell { + buildInputs = [ llama-python-extra ]; + packages = nativeBuildInputs ++ osSpecific; + }; }); } From d69d777c02b9ac405a95f3cbfba219a990caefff Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 18:32:28 +0200 Subject: [PATCH 21/63] ggml : quantization refactoring (#3833) * ggml : factor all quantization code in ggml-quants ggml-ci * ggml-quants : fix Zig and Swift builds + quantize tool ggml-ci * quantize : --pure option for disabling k-quant mixtures --------- Co-authored-by: cebtenzzre --- CMakeLists.txt | 12 +- Makefile | 18 +- Package.swift | 3 +- build.zig | 21 +- examples/quantize/quantize.cpp | 9 +- k_quants.c => ggml-quants.c | 2248 ++++++++++++++++++++++++++++++- k_quants.h => ggml-quants.h | 103 +- ggml.c | 2301 +------------------------------- ggml.h | 7 + llama.cpp | 34 +- llama.h | 1 + 11 files changed, 2372 insertions(+), 2385 deletions(-) rename k_quants.c => ggml-quants.c (71%) rename k_quants.h => 
ggml-quants.h (63%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9fc86237..3659279e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_MPI "llama: use MPI" OFF) -option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) @@ -278,13 +277,8 @@ if (LLAMA_BLAS) endif() endif() -if (LLAMA_K_QUANTS) - set(GGML_HEADERS_EXTRA k_quants.h) - set(GGML_SOURCES_EXTRA k_quants.c) - add_compile_definitions(GGML_USE_K_QUANTS) - if (LLAMA_QKK_64) - add_compile_definitions(GGML_QKK_64) - endif() +if (LLAMA_QKK_64) + add_compile_definitions(GGML_QKK_64) endif() if (LLAMA_CUBLAS) @@ -673,6 +667,8 @@ add_library(ggml OBJECT ggml-alloc.h ggml-backend.c ggml-backend.h + ggml-quants.c + ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} diff --git a/Makefile b/Makefile index 68069f9ff..2cecc2216 100644 --- a/Makefile +++ b/Makefile @@ -342,13 +342,9 @@ else MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifndef LLAMA_NO_K_QUANTS - MK_CPPFLAGS += -DGGML_USE_K_QUANTS - OBJS += k_quants.o ifdef LLAMA_QKK_64 MK_CPPFLAGS += -DGGML_QKK_64 endif -endif ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. @@ -365,7 +361,7 @@ ifdef LLAMA_MPI MK_CPPFLAGS += -DGGML_USE_MPI MK_CFLAGS += -Wno-cast-qual MK_CXXFLAGS += -Wno-cast-qual - OBJS += ggml-mpi.o + OBJS += ggml-mpi.o endif # LLAMA_MPI ifdef LLAMA_OPENBLAS @@ -382,7 +378,7 @@ endif # LLAMA_BLIS ifdef LLAMA_CUBLAS MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib - OBJS += ggml-cuda.o + OBJS += ggml-cuda.o NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math ifdef LLAMA_CUDA_NVCC NVCC = $(LLAMA_CUDA_NVCC) @@ -497,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI -ifndef LLAMA_NO_K_QUANTS -k_quants.o: k_quants.c k_quants.h - $(CC) $(CFLAGS) -c $< -o $@ -endif # LLAMA_NO_K_QUANTS - # combine build flags with cmdline overrides override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) @@ -542,7 +533,10 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o +ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/Package.swift b/Package.swift index 4ab055b19..5b3bd72ca 100644 --- a/Package.swift +++ b/Package.swift @@ -42,13 +42,12 @@ let package = Package( "llama.cpp", "ggml-alloc.c", "ggml-backend.c", - "k_quants.c", + "ggml-quants.c", ] + additionalSources, resources: resources, publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_K_QUANTS"), .define("GGML_USE_ACCELERATE") // NOTE: 
NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 diff --git a/build.zig b/build.zig index dcfa3dd6b..9b58b74ca 100644 --- a/build.zig +++ b/build.zig @@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void { var make = try Maker.init(b); make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; - if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) { - try make.addFlag("-DGGML_USE_K_QUANTS"); - const k_quants = make.obj("k_quants", "k_quants.c"); - try make.objs.append(k_quants); - } - const ggml = make.obj("ggml", "ggml.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); + const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); const llama = make.obj("llama", "llama.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); @@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void { const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index c7dd0d894..be0b2fe1e 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -18,7 +18,6 @@ static const std::vector QUANT_OPTIONS = { { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, 
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, -#ifdef GGML_USE_K_QUANTS { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, @@ -31,7 +30,6 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, -#endif { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, @@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { @@ -103,6 +102,8 @@ int main(int argc, char ** argv) { params.quantize_output_tensor = false; } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; + } else if (strcmp(argv[arg_idx], "--pure") == 0) { + params.pure = true; } else { usage(argv[0]); } diff --git a/k_quants.c b/ggml-quants.c similarity index 71% rename from k_quants.c rename to ggml-quants.c index 801941fbe..fd4ee1be6 100644 --- a/k_quants.c +++ b/ggml-quants.c @@ -1,9 +1,10 @@ -#include "k_quants.h" +#include "ggml-quants.h" #include "ggml.h" #include #include #include +#include #ifdef __ARM_NEON @@ -65,6 +66,1024 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); +} + +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +#if defined(__AVX2__) || defined(__AVX512F__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = _mm256_set_epi64x( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) + +#if !defined(__aarch64__) + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 
2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +#endif +#endif + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// reference implementation for deterministic creation of model files +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + y[i].m = ggml_fp32_to_fp16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 
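To make the Q4_0 scheme above concrete, here is a minimal standalone sketch (not part of this patch) that round-trips one block using the same formulas as quantize_row_q4_0_reference and dequantize_row_q4_0. The block layout, variable names, and the use of a plain float scale instead of the ggml_fp16_t field are assumptions made for brevity.

```c
// Illustrative sketch only (not part of this patch): quantize and dequantize
// one Q4_0-style block. The real code stores the scale as ggml_fp16_t.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define QK 32 // block size, matching QK4_0

int main(void) {
    float x[QK], y[QK];
    uint8_t qs[QK/2];

    for (int j = 0; j < QK; ++j) x[j] = sinf((float) j); // arbitrary test data

    // find the value with the largest magnitude; it will map to quant value -8
    float amax = 0.0f, max = 0.0f;
    for (int j = 0; j < QK; ++j) {
        if (fabsf(x[j]) > amax) { amax = fabsf(x[j]); max = x[j]; }
    }

    const float d  = max / -8;       // scale: the extreme value maps to -8
    const float id = d ? 1.0f/d : 0.0f;

    for (int j = 0; j < QK/2; ++j) {
        int xi0 = (int)(x[j       ]*id + 8.5f); if (xi0 > 15) xi0 = 15;
        int xi1 = (int)(x[j + QK/2]*id + 8.5f); if (xi1 > 15) xi1 = 15;
        qs[j] = (uint8_t)(xi0 | (xi1 << 4));   // two 4-bit quants per byte
    }

    // dequantize: nibble minus the implicit offset of 8, times the block scale
    for (int j = 0; j < QK/2; ++j) {
        y[j       ] = ((qs[j] & 0x0F) - 8)*d;
        y[j + QK/2] = ((qs[j] >>   4) - 8)*d;
    }

    for (int j = 0; j < QK; ++j) {
        printf("%2d: %+.4f -> %+.4f\n", j, x[j], y[j]);
    }
    return 0;
}
```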
1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} + +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + y[i].m = ggml_fp32_to_fp16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} + +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} + +void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = ggml_fp32_to_fp16(d); + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_0); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = ggml_fp32_to_fp16(d); + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + } +#else + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = 
_mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_1); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = sum*d; + } +#else + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = ggml_fp16_to_fp32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = ggml_fp16_to_fp32(x[i].d); + const float m = ggml_fp16_to_fp32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = ggml_fp16_to_fp32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q5_1(const 
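The qh field is the least obvious part of the Q5 formats, so here is a small sketch (not part of this patch) of how the fifth bits travel through it: bit j of the 32-bit word belongs to quant j of the block, first half in bits 0..15 and second half in bits 16..31, which is exactly what quantize_row_q5_0_reference packs and dequantize_row_q5_0 above extracts. The helper name pack_qh is illustrative only.

```c
// Illustrative sketch only (not part of this patch): pack the high bits of 32
// five-bit quants into qh and recover them with the same shifts the
// dequantizer uses.
#include <assert.h>
#include <stdint.h>

#define QK 32 // block size, matching QK5_0

// build qh the way quantize_row_q5_0_reference does
static uint32_t pack_qh(const uint8_t q[QK]) {
    uint32_t qh = 0;
    for (int j = 0; j < QK/2; ++j) {
        qh |= ((q[j       ] & 0x10u) >> 4) << (j + 0);
        qh |= ((q[j + QK/2] & 0x10u) >> 4) << (j + QK/2);
    }
    return qh;
}

int main(void) {
    uint8_t q[QK];
    for (int j = 0; j < QK; ++j) q[j] = (uint8_t)((j*7) & 0x1F); // arbitrary 5-bit quants

    const uint32_t qh = pack_qh(q);

    // the shifts used by dequantize_row_q5_0 recover each quant's fifth bit
    for (int j = 0; j < QK/2; ++j) {
        const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
        const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
        assert(xh_0 == (q[j       ] & 0x10));
        assert(xh_1 == (q[j + QK/2] & 0x10));
    }
    return 0;
}
```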
block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = ggml_fp16_to_fp32(x[i].d); + const float m = ggml_fp16_to_fp32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = ggml_fp16_to_fp32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + // // 2-6 bit quantization in super-blocks // @@ -1264,15 +2283,6 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { // #if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - // shuffles to pick the required scales in dot products static inline __m256i get_scale_shuffle_q3k(int i) { static const uint8_t k_shuffle[128] = { @@ -1311,6 +2321,1224 @@ static inline __m128i get_scale_shuffle(int i) { } #endif +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), 
ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[0].d) * ggml_fp16_to_fp32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[1].d) * ggml_fp16_to_fp32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( 
d_2_3, p3 ); + } + + assert(nb % 2 == 0); // TODO: handle odd nb + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[i + 1].d) * ggml_fp16_to_fp32(y[i + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + // subtract offset + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for 
(int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += ggml_fp16_to_fp32(x0->m) * y0->s + ggml_fp16_to_fp32(x1->m) * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), ggml_fp16_to_fp32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = ggml_fp16_to_fp32(x[i].d); + const float d1 = y[i].d; + + 
summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t 
qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = 
wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * ggml_fp16_to_fp32(y0->d)))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // These tempory registers are for masking and shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); + + vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); + vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); + vuint32m2_t xhr_0 = 
__riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); + vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += ggml_fp16_to_fp32(x0->m) * y0->s; + summs1 += ggml_fp16_to_fp32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + 
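The table_b2b_* lookups used here expand one byte of qh into eight mask bytes at once: byte k of table_b2b_0[b] is 0x10 when bit k of b is set, and table_b2b_1 holds the complement, so four lookups turn the packed 32 high bits into 32 per-quant masks. A small sketch of that property follows (not part of the patch; the helper b2b_0 is illustrative and simply reproduces what the B8(00, 10) macro expansion precomputes).

```c
// Illustrative sketch only (not part of this patch): what one entry of the
// table_b2b_0 / table_b2b_1 lookup tables encodes.
#include <assert.h>
#include <stdint.h>

// byte k of the result is ((b >> k) & 1) << 4, matching table_b2b_0[b]
static uint64_t b2b_0(uint8_t b) {
    uint64_t v = 0;
    for (int k = 0; k < 8; ++k) {
        v |= (uint64_t)(((b >> k) & 1) << 4) << (8*k);
    }
    return v;
}

int main(void) {
    // example: b = 0b10100101 -> bytes 0, 2, 5 and 7 carry the 0x10 mask
    assert(b2b_0(0xA5) == 0x1000100000100010ULL);
    // the complement table is the lookup of the bitwise-NOT byte
    // (table_b2b_1[b] == table_b2b_0[(uint8_t)~b])
    assert(b2b_0((uint8_t)~0xA5) == 0x0010001010001000ULL);
    return 0;
}
```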
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += ggml_fp16_to_fp32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and 
(v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d)); + + summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d)); + + summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // temporary registers for shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // load qh + vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m2_t xhr_0 
= __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), 
ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)); + } + + *s = sumf; +#endif +} + #if QK_K == 256 void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { diff --git a/k_quants.h b/ggml-quants.h similarity index 63% rename from k_quants.h rename to ggml-quants.h index 9de089e7a..d88f99e33 100644 --- a/k_quants.h +++ b/ggml-quants.h @@ -1,20 +1,14 @@ #pragma once +// This is a private API for quantization and dequantization +// Should not be used directly, use ggml.h instead + #include "ggml.h" #include #include #include -// Super-block size -#ifdef GGML_QKK_64 -#define QK_K 64 -#define K_SCALE_SIZE 4 -#else -#define QK_K 256 -#define K_SCALE_SIZE 12 -#endif - #ifndef static_assert #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, 
msg) @@ -23,10 +17,66 @@ #endif #endif +#define QK4_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + // // Super-block quantization structures // +// Super-block size +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + // 2-bit quantization // weight is represented as x = a * q + b // 16 blocks of 16 elements each @@ -127,6 +177,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); + void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); @@ -134,6 +191,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); + void 
quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); @@ -142,6 +206,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); // Dequantization +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); +//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); + void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); @@ -150,16 +221,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); // Dot product +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -// Quantization with histogram collection -size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - diff --git a/ggml.c b/ggml.c index 6f66bab05..95f72c35e 100644 --- a/ggml.c +++ b/ggml.c @@ -1,10 +1,7 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" - -#ifdef GGML_USE_K_QUANTS -#include "k_quants.h" -#endif +#include "ggml-quants.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -443,21 +440,6 @@ static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; -#if defined(__ARM_NEON) || defined(__wasm_simd128__) -#define 
B1(c,s,n) 0x ## n ## c , 0x ## n ## s -#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) -#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) -#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) -#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) -#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) -#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) -#define B8(c,s ) B7(c,s, c), B7(c,s, s) - -// precomputed tables for expanding 8bits to 8 bytes: -static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 -static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 -#endif - // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. @@ -587,1071 +569,8 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -// -// quantization -// - -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); -} - -#if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if defined(__AVX2__) || defined(__AVX512F__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if __AVXVNNI__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -#endif -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { -#if __AVXVNNIINT8__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - return mul_sum_us8_pairs_float(ax, sy); -#endif -} - -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh -#if __AVX512F__ - const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 - bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh - return _mm256_cvtepi16_epi8(bytes); // abcd_efgh -#else - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return _mm_packus_epi16( r0, r1 ); -#endif -} -#elif defined(__AVX__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); - const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); - __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); - __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); - const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytesl = _mm_or_si128(bytesl, bit_mask); - bytesh = _mm_or_si128(bytesh, bit_mask); - bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); - bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); - return MM256_SET_M128I(bytesh, bytesl); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); - __m128i tmph = _mm_srli_epi16(tmpl, 4); - const __m128i lowMask = _mm_set1_epi8(0xF); - tmpl = _mm_and_si128(lowMask, tmpl); - tmph = _mm_and_si128(lowMask, tmph); - return MM256_SET_M128I(tmph, tmpl); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { - const __m128i ones = _mm_set1_epi16(1); - const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); - const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); - const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - const __m128i axl = _mm256_castsi256_si128(ax); - const __m128i axh = _mm256_extractf128_si256(ax, 1); - const __m128i syl = _mm256_castsi256_si128(sy); - const __m128i syh = _mm256_extractf128_si256(sy, 1); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m128i xl = _mm256_castsi256_si128(x); - const __m128i xh = _mm256_extractf128_si256(x, 1); - const __m128i yl = _mm256_castsi256_si128(y); - const __m128i yh = _mm256_extractf128_si256(y, 1); - // Get absolute values of x vectors - const __m128i axl = _mm_sign_epi8(xl, xl); - const __m128i axh = _mm_sign_epi8(xh, xh); - // Sign the values of the y vectors - const __m128i syl = _mm_sign_epi8(yl, xl); - const __m128i syh = _mm_sign_epi8(yh, xh); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} -#endif -#elif defined(__SSSE3__) -// horizontally add 4x4 floats -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =_mm_hadd_ps(a, b); - __m128 res_1 =_mm_hadd_ps(c, d); - __m128 res =_mm_hadd_ps(res_0, res_1); - res =_mm_hadd_ps(res, res); - res =_mm_hadd_ps(res, res); - - return _mm_cvtss_f32(res); -} -#endif // __AVX__ || __AVX2__ || __AVX512F__ -#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) - -#if defined(__ARM_NEON) - -#if !defined(__aarch64__) - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 
2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -#endif -#endif - -#define QK4_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); - -#define QK4_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); - -#define QK5_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); - -#define QK5_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); - -#define QK8_0 32 -typedef struct { - ggml_fp16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); - -#define QK8_1 32 -typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants -} block_q8_1; -static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); - -// reference implementation for deterministic creation of model files -static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); -} - -static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { - const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); -} - -static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(qh)); - } -} - -static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); -} - -static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { - const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 5) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); - } -} - -static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = x[i*QK8_0 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = x[i*QK8_0 + j]*id; - - y[i].qs[j] = roundf(x0); - } - } -} - -static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - } - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, 
ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_0); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - } -#else - // scalar - quantize_row_q8_0_reference(x, y, k); -#endif -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { - assert(QK8_1 == 32); - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_1; j++) { - const float v = x[i*QK8_1 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - int sum = 0; - - for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + j]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); - - sum += y[i].qs[ j]; - sum += y[i].qs[QK8_1/2 + j]; - } - - y[i].s = sum*d; - } -} - -static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - - int32x4_t accv = vdupq_n_s32(0); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - - accv = vaddq_s32(accv, vi); - } - - y[i].s = d * vaddvq_s32(accv); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - v128_t accv = wasm_i32x4_splat(0); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - - accv = wasm_i32x4_add(accv, vi); - } - - y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3)); - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_1); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - - // compute sum for y[i].s - vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); - vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); - - // set y[i].s - int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = sum*d; - } -#else - // scalar - quantize_row_q8_1_reference(x, y, k); -#endif -} - -static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { - static const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F); - const int x1 = (x[i].qs[j] >> 4); - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { - static const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int x0 = (x[i].qs[j] & 0x0F) | xh_0; - const int x1 = (x[i].qs[j] >> 4) | xh_1; - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { - static const int qk = QK8_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - const block_q8_0 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; - } - } -} - static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void 
ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -1740,7 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK8_0, .type_size = sizeof(block_q8_0), .is_quantized = true, - .to_float = dequantize_row_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, .from_float = quantize_row_q8_0, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, @@ -1755,7 +674,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, -#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, @@ -1818,7 +736,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .from_float = quantize_row_q8_K, } -#endif }; // For internal test use @@ -2442,1218 +1359,6 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest *s = sumf; } -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l 
= vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__SSSE3__) - // set constants - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - // Initialize accumulator with zeros - __m128 acc_0 = _mm_setzero_ps(); - __m128 acc_1 = _mm_setzero_ps(); - __m128 acc_2 = _mm_setzero_ps(); - __m128 acc_3 = _mm_setzero_ps(); - - // First round without accumulation - { - _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( 
GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - acc_0 = _mm_mul_ps( d_0_1, p0 ); - acc_1 = _mm_mul_ps( d_0_1, p1 ); - acc_2 = _mm_mul_ps( d_2_3, p2 ); - acc_3 = _mm_mul_ps( d_2_3, p3 ); - } - - // Main loop - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 2; i < nb; i+=2) { - _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); 
- __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); - __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); - __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); - - // Acummulate - acc_0 = _mm_add_ps(p0_d, acc_0); - acc_1 = _mm_add_ps(p1_d, acc_1); - acc_2 = _mm_add_ps(p2_d, acc_2); - acc_3 = _mm_add_ps(p3_d, acc_3); - } - - *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - // subtract offset - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F) - 8; - const int v1 = (x[i].qs[j] >> 4) - 8; - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; - - // TODO: add WASM SIMD -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = 
vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (int i = 0; i < nb; ++i) { - const float d0 = GGML_FP16_TO_FP32(x[i].d); - const float d1 = y[i].d; - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - const __m256 d0v = _mm256_set1_ps( d0 ); - const __m256 d1v = _mm256_set1_ps( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - - const __m256 xy = mul_sum_us8_pairs_float(bx, by); - - // Accumulate d0*d1*x*y -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d0d1, xy, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); -#endif - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 
0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F); - const int v1 = (x[i].qs[j] >> 4); - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_0); - - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - // extract the 5th bit via lookup table ((!b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const 
int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); - - tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); - const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - bx = _mm256_or_si256(bx, bxhi); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#elif 
defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8((char)0xF0); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_andnot_si128(bxhil, mask); - bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); - - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - uint32_t qh; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - // These tempory registers are for masking and shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); - - vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); - vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); - - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); - - // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); - vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); - - // ((qh & (1u << (j + 16))) >> (j + 12)); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); - vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); - - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); - - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); - - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 
))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_1); - - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; - - // extract the 5th bit via lookup table ((b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_0[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit - const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - 
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - - float summs = 0.0f; - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; - - summs += GGML_FP16_TO_FP32(x0->m) * y0->s; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); - - tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit - const v128_t v0lf = wasm_v128_or(v0l, qhl); - const v128_t v0hf = wasm_v128_or(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); - } - - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0.0f; - - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - bx = _mm256_or_si256(bx, bxhi); - - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i 
*)y[i].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx, by); - - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8(0x10); - - float summs = 0.0f; - - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_and_si128(bxhil, mask); - bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); - - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx, by); - - acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - uint32_t qh; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - // temporary registers for shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); - - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); - - // load qh - vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); - - // ((qh >> (j + 0)) << 4) & 0x10; - vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); - - // ((qh >> (j + 12)) ) & 0x10; - vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); - - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); - - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); - - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = 
((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; - const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - -#else - const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); - const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); - const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); - const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); - - const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); - const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); - const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); - const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); - - const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); - const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); - const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); - const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - // Multiply q with scale and accumulate -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d, q, acc ); -#else - acc = 
_mm256_add_ps( _mm256_mul_ps( d, q ), acc ); -#endif - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - size_t vl = __riscv_vsetvl_e8m1(qk); - - for (int i = 0; i < nb; i++) { - // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); - - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[i].qs[j]*y[i].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#endif -} - // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { @@ -21001,7 +18706,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; -#ifdef GGML_USE_K_QUANTS case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); @@ -21032,7 +18736,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q6_K * block = (block_q6_K*)dst + start / QK_K; result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; -#endif case GGML_TYPE_F16: { int elemsize = sizeof(ggml_fp16_t); diff --git a/ggml.h b/ggml.h index 08bff5511..8c954904e 100644 --- a/ggml.h +++ b/ggml.h @@ -1930,12 +1930,19 @@ extern "C" { // quantization // + // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); // diff --git a/llama.cpp b/llama.cpp index 3d431ee7b..1d1db8fc9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,13 +19,11 @@ #ifdef GGML_USE_MPI # include "ggml-mpi.h" #endif -#ifdef GGML_USE_K_QUANTS -# ifndef QK_K -# ifdef GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif +#ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 # endif #endif @@ -8052,7 +8050,7 @@ struct no_init { struct quantize_state_internal 
{ const llama_model & model; const llama_model_quantize_params * params; -#ifdef GGML_USE_K_QUANTS + int n_attention_wv = 0; int n_feed_forward_w2 = 0; int i_attention_wv = 0; @@ -8060,7 +8058,7 @@ struct quantize_state_internal { int n_k_quantized = 0; int n_fallback = 0; -#endif + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) @@ -8125,7 +8123,6 @@ static void llama_convert_tensor_internal( workers.clear(); } -#ifdef GGML_USE_K_QUANTS static ggml_type get_k_quant_type( quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype @@ -8237,7 +8234,6 @@ static ggml_type get_k_quant_type( return new_type; } -#endif static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type quantized_type; @@ -8252,7 +8248,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; -#ifdef GGML_USE_K_QUANTS // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: @@ -8263,7 +8258,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; -#endif + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -8304,7 +8299,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); -#ifdef GGML_USE_K_QUANTS for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * meta = ml.get_tensor_meta(i); @@ -8322,7 +8316,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); } -#endif size_t total_size_org = 0; size_t total_size_new = 0; @@ -8387,9 +8380,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; -#ifdef GGML_USE_K_QUANTS - new_type = get_k_quant_type(qs, new_type, tensor, ftype); -#endif + if (!params->pure) { + new_type = get_k_quant_type(qs, new_type, tensor, ftype); + } + // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
quantize = tensor->type != new_type; @@ -8514,12 +8508,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("\n"); } } -#ifdef GGML_USE_K_QUANTS + if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); } -#endif } static int llama_apply_lora_from_file_internal( @@ -8844,6 +8837,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, + /*.pure =*/ false, }; return result; diff --git a/llama.h b/llama.h index d901dcd91..6927bd601 100644 --- a/llama.h +++ b/llama.h @@ -191,6 +191,7 @@ extern "C" { bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // disable k-quant mixtures and quantize all tensors to the same type } llama_model_quantize_params; // grammar types From 71a09da301705b9c5ad4ca3cf3fbd966dd3f1ec5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 18:32:51 +0200 Subject: [PATCH 22/63] llama : fix kv shift bug (#3835) ggml-ci --- llama.cpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1d1db8fc9..d8510a5cf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1552,14 +1552,14 @@ static void llama_kv_cache_seq_shift( for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].pos += delta; + cache.has_shift = true; + cache.cells[i].pos += delta; + cache.cells[i].delta += delta; + if (cache.cells[i].pos < 0) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; - } else { - cache.has_shift = true; - cache.cells[i].delta = delta; } } } @@ -6073,11 +6073,20 @@ static int llama_decode_internal( #endif // update the kv ring buffer - lctx.kv_self.has_shift = false; - lctx.kv_self.head += n_tokens; - // Ensure kv cache head points to a valid index. - if (lctx.kv_self.head >= lctx.kv_self.size) { - lctx.kv_self.head = 0; + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + + kv_self.head += n_tokens; + + // Ensure kv cache head points to a valid index. 
+        if (kv_self.head >= kv_self.size) {
+            kv_self.head = 0;
+        }
     }
 
 #ifdef GGML_PERF

From 2046eb4345e62c4575b3cdc0115a51db89f3fb70 Mon Sep 17 00:00:00 2001
From: cebtenzzre
Date: Sun, 29 Oct 2023 12:33:47 -0400
Subject: [PATCH 23/63] make : remove unnecessary dependency on build-info.h (#3842)

---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 2cecc2216..c53c1e726 100644
--- a/Makefile
+++ b/Makefile
@@ -541,10 +541,10 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o
 
-common.o: common/common.cpp $(COMMON_H_DEPS)
+common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)

From 6e08281e588bbba1a5d180290a94a43f167f3a1a Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sun, 29 Oct 2023 11:31:40 -0600
Subject: [PATCH 24/63] Extend llama_kv_cache_seq_rm to allow matching any sequence (#3843)

* Extend llama_kv_cache_seq_rm to allow matching any sequence

* Replace llama_kv_cache_tokens_rm with llama_kv_cache_clear

Use llama_kv_cache_clear for cache clearing

Change calls to llama_kv_cache_tokens_rm that want to delete by position to use llama_kv_cache_seq_rm functionality
---
 common/common.cpp | 2 +-
 examples/batched-bench/batched-bench.cpp | 2 +-
 examples/llama-bench/llama-bench.cpp | 4 ++--
 examples/main/main.cpp | 2 +-
 examples/perplexity/perplexity.cpp | 6 ++---
 examples/server/server.cpp | 2 +-
 llama.cpp | 29 ++++++++++++------------
 llama.h | 15 +++++-------
 8 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index f81f4d354..c187128d6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -889,7 +889,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_tokens_rm(lctx, -1, -1);
+        llama_kv_cache_clear(lctx);
         llama_reset_timings(lctx);
     }
 
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 43f9c971d..533c55c17 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
 
         const auto t_pp_start = ggml_time_us();
 
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
             LOG_TEE("%s: llama_decode() failed\n", __func__);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 20767d555..780398184 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_tokens_rm(ctx, -1, -1);
+            llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3d9f670b9..8a43b6ab8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -298,7 +298,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOGLN( diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 3c2542e8c..bd2c73d87 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab); if (logits.empty()) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5b7e4139d..c163c7f8e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -857,7 +857,7 @@ struct llama_server_context void kv_cache_clear() { // clear the entire KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); clean_kv_cache = false; } diff --git a/llama.cpp b/llama.cpp index d8510a5cf..a4340d527 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1466,17 +1466,12 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { - if (c0 < 0) c0 = 0; - if (c1 < 0) c1 = cache.size; - - for (int32_t i = c0; i < c1; ++i) { +static void llama_kv_cache_clear(struct llama_kv_cache & cache) { + for (int32_t i = 0; i < cache.size; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } - - // Searching for a free slot can start here since we know it will be empty. 
- cache.head = uint32_t(c0); + cache.head = 0; } static void llama_kv_cache_seq_rm( @@ -1490,8 +1485,14 @@ static void llama_kv_cache_seq_rm( if (p1 < 0) p1 = std::numeric_limits::max(); for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.erase(seq_id); + if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + if (seq_id < 0) { + cache.cells[i].seq_id.clear(); + } else if (cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].seq_id.erase(seq_id); + } else { + continue; + } if (cache.cells[i].seq_id.empty()) { cache.cells[i].pos = -1; if (new_head == cache.size) new_head = i; @@ -9207,8 +9208,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } -void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) { - llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1); +void llama_kv_cache_clear(struct llama_context * ctx) { + llama_kv_cache_clear(ctx->kv_self); } void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { @@ -9654,7 +9655,7 @@ int llama_eval( llama_token * tokens, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); if (ret < 0) { @@ -9669,7 +9670,7 @@ int llama_eval_embd( float * embd, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; diff --git a/llama.h b/llama.h index 6927bd601..d727dbd9f 100644 --- a/llama.h +++ b/llama.h @@ -334,17 +334,14 @@ extern "C" { LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), "avoid using this, it will be removed in the future, instead - count the tokens in user code"); - // Remove all tokens data of cells in [c0, c1) - // c0 < 0 : [0, c1] - // c1 < 0 : [c0, inf) - LLAMA_API void llama_kv_cache_tokens_rm( - struct llama_context * ctx, - int32_t c0, - int32_t c1); + // Clear the KV cache + LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) + // seq_id < 0 : match any sequence + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, From 207b51900e15cc7f89763a3bb1c565fe11cbb45d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Oct 2023 19:19:15 +0200 Subject: [PATCH 25/63] ggml : move FP16 <-> FP32 code to ggml-impl.h (#3861) * ggml : move FP16 <-> FP32 stuff to ggml-impl.h ggml-ci * tests : fix ARM build * ggml : explicitly initialize deprecated type traits * ggml : add math.h to ggml-impl.h * ggml : remove duplicate static assert macros * ggml : prefix lookup tables with ggml_ ggml-ci * ggml-impl : move extern "C" to start of file --- ggml-impl.h | 237 ++++++++++++++++++++++++ ggml-quants.c | 350 ++++++++++++++++++------------------ ggml-quants.h | 14 +- ggml.c | 282 +++++------------------------ llama.cpp | 2 +- tests/test-double-float.cpp | 2 +- tests/test-quantize-fns.cpp | 7 + 7 files changed, 470 insertions(+), 424 deletions(-) create mode 100644 ggml-impl.h diff --git 
new file mode 100644
index 000000000..5ec18a50c
--- /dev/null
+++ b/ggml-impl.h
@@ -0,0 +1,237 @@
+#pragma once
+
+#include "ggml.h"
+
+// GGML internal header
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#endif
+
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+
+#define GGML_FP16_TO_FP32(x) ((float) (x))
+#define GGML_FP32_TO_FP16(x) (x)
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline
uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +#endif + + // TODO: backend v2 PR + +#ifdef __cplusplus +} +#endif diff --git a/ggml-quants.c b/ggml-quants.c index fd4ee1be6..721594467 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1,5 +1,5 @@ #include "ggml-quants.h" -#include "ggml.h" +#include "ggml-impl.h" #include #include @@ -352,7 +352,7 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict const float d = max / -8; const float id = d ? 
1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < qk/2; ++j) { const float x0 = x[i*qk + 0 + j]*id; @@ -392,8 +392,8 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); - y[i].m = ggml_fp32_to_fp16(min); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); for (int j = 0; j < qk/2; ++j) { const float x0 = (x[i*qk + 0 + j] - min)*id; @@ -434,7 +434,7 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; @@ -481,8 +481,8 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); - y[i].m = ggml_fp32_to_fp16(min); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); uint32_t qh = 0; @@ -524,7 +524,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < QK8_0; ++j) { const float x0 = x[i*QK8_0 + j]*id; @@ -559,7 +559,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); @@ -592,7 +592,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); @@ -627,7 +627,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -704,7 +704,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 
1.0f/d : 0.0f; - y[i].d = ggml_fp32_to_fp16(d); + y[i].d = GGML_FP32_TO_FP16(d); vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); @@ -982,7 +982,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F) - 8; @@ -1002,8 +1002,8 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float m = ggml_fp16_to_fp32(x[i].m); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F); @@ -1023,7 +1023,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1049,8 +1049,8 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float m = ggml_fp16_to_fp32(x[i].m); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1076,7 +1076,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk; ++j) { y[i*qk + j] = x[i].qs[j]*d; @@ -1387,10 +1387,10 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict int l = nearest_int(iscale*scales[j]); y[i].scales[j] = l; } - y[i].d = ggml_fp32_to_fp16(max_scale/q4scale); + y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale); } else { for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); } if (max_min > 0) { float iscale = q4scale/max_min; @@ -1398,14 +1398,14 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict int l = nearest_int(iscale*mins[j]); y[i].scales[j] |= (l << 4); } - y[i].dmin = ggml_fp32_to_fp16(max_min/q4scale); + y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale); } else { - y[i].dmin = ggml_fp32_to_fp16(0.f); + y[i].dmin = GGML_FP32_TO_FP16(0.f); } for (int j = 0; j < QK_K/16; ++j) { - const float d = ggml_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF); + const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4); + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); for (int ii = 0; ii < 16; ++ii) { int l = nearest_int((x[16*j + ii] + dm)/d); l = MAX(0, MIN(3, l)); @@ -1436,8 +1436,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * q = x[i].qs; @@ -1526,16 +1526,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict l >>= 4; 
y[i].scales[j%4 + 8] |= (l << (2*(j/4))); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); } else { - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); } int8_t sc; for (int j = 0; j < QK_K/16; ++j) { sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - float d = ggml_fp16_to_fp32(y[i].d) * sc; + float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) { continue; } @@ -1555,16 +1555,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict l2 = 8 + MAX(-8, MIN(7, l2)); y[i].scales[j/2] = l1 | (l2 << 4); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); } else { for (int j = 0; j < QK_K/16; j+=2) { y[i].scales[j/2] = 0; } - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); } for (int j = 0; j < QK_K/16; ++j) { int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; - float d = ggml_fp16_to_fp32(y[i].d) * (s - 8); + float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); if (!d) { continue; } @@ -1618,7 +1618,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q = x[i].qs; const uint8_t * restrict hm = x[i].hmask; @@ -1663,7 +1663,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q = x[i].qs; const uint8_t * restrict hm = x[i].hmask; @@ -1753,15 +1753,15 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = ggml_fp32_to_fp16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = ggml_fp16_to_fp32(y[i].d) * sc; + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * m; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + dm)/d); l = MAX(0, MIN(15, l)); @@ -1778,17 +1778,17 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict int m2 = nearest_int(inv_min*mins[1]); y[i].scales[0] = d1 | (m1 << 4); y[i].scales[1] = d2 | (m2 << 4); - y[i].d[0] = ggml_fp32_to_fp16(max_scale/s_factor); - y[i].d[1] = ggml_fp32_to_fp16(max_min/s_factor); + y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor); + y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor); float sumlx = 0; int suml2 = 0; for (int j = 0; j < QK_K/32; ++j) { const uint8_t sd = y[i].scales[j] & 0xF; const uint8_t sm = y[i].scales[j] >> 4; - const float d = ggml_fp16_to_fp32(y[i].d[0]) * sd; + const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd; if (!d) continue; - const float m = ggml_fp16_to_fp32(y[i].d[1]) * sm; + const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + m)/d); l = MAX(0, MIN(15, l)); @@ -1798,7 +1798,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } if (suml2) { - y[i].d[0] = ggml_fp32_to_fp16(sumlx/suml2); + 
y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2); } #endif uint8_t * q = y[i].qs; @@ -1822,8 +1822,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int #if QK_K == 256 - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = GGML_FP16_TO_FP32(x[i].dmin); int is = 0; uint8_t sc, m; @@ -1837,8 +1837,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int q += 32; is += 2; } #else - const float dall = ggml_fp16_to_fp32(x[i].d[0]); - const float mall = ggml_fp16_to_fp32(x[i].d[1]); + const float dall = GGML_FP16_TO_FP32(x[i].d[0]); + const float mall = GGML_FP16_TO_FP32(x[i].d[1]); const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); for (int l = 0; l < 32; ++l) { @@ -1924,15 +1924,15 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = ggml_fp32_to_fp16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = ggml_fp16_to_fp32(y[i].d) * sc; + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * m; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + dm)/d); l = MAX(0, MIN(31, l)); @@ -1976,10 +1976,10 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict int l = nearest_int(iscale*scales[j]); y[i].scales[j] = MAX(-128, MIN(127, l)); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); for (int j = 0; j < QK_K/16; ++j) { - const float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; + const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; if (!d) continue; for (int ii = 0; ii < 16; ++ii) { int l = nearest_int(x[16*j + ii]/d); @@ -2023,8 +2023,8 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int #if QK_K == 256 - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = GGML_FP16_TO_FP32(x[i].dmin); int is = 0; uint8_t sc, m; @@ -2040,7 +2040,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int u1 <<= 2; u2 <<= 2; } #else - float d = ggml_fp16_to_fp32(x[i].d); + float d = GGML_FP16_TO_FP32(x[i].d); const int8_t * restrict s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 
0 : 16)); @@ -2103,19 +2103,19 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict if (!max_abs_scale) { memset(&y[i], 0, sizeof(block_q6_K)); - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); x += QK_K; continue; } float iscale = -128.f/max_scale; - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); for (int ib = 0; ib < QK_K/16; ++ib) { y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); } for (int j = 0; j < QK_K/16; ++j) { - float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; + float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; if (!d) { continue; } @@ -2164,7 +2164,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict ql = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -2371,8 +2371,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); @@ -2389,8 +2389,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2402,7 +2402,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, // Main loop for (int i = 0; i < nb; ++i) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); __m256i bx = bytes_from_nibbles_32(x[i].qs); @@ -2426,7 +2426,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i off = _mm_set1_epi8(8); @@ -2468,7 +2468,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); // 
Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[0].d) * ggml_fp16_to_fp32(y[0].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); @@ -2486,7 +2486,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[1].d) * ggml_fp16_to_fp32(y[1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); @@ -2521,7 +2521,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); @@ -2539,7 +2539,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( ggml_fp16_to_fp32(x[i + 1].d) * ggml_fp16_to_fp32(y[i + 1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); @@ -2606,7 +2606,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d); + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); } *s = sumf; @@ -2624,7 +2624,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += sumi*ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d); + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); } *s = sumf; @@ -2655,7 +2655,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += ggml_fp16_to_fp32(x0->m) * y0->s + ggml_fp16_to_fp32(x1->m) * y1->s; + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2679,8 +2679,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), ggml_fp16_to_fp32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), ggml_fp16_to_fp32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); @@ -2697,8 +2697,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * 
restri const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -2711,10 +2711,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; ++i) { - const float d0 = ggml_fp16_to_fp32(x[i].d); + const float d0 = GGML_FP16_TO_FP32(x[i].d); const float d1 = y[i].d; - summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -2766,7 +2766,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; @@ -2784,7 +2784,7 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; @@ -2864,10 +2864,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -2884,8 +2884,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2946,7 +2946,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * 
ggml_fp16_to_fp32(y0->d)))); + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -2958,7 +2958,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2982,7 +2982,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3066,7 +3066,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; } *s = sumf; @@ -3090,7 +3090,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)) * sumi; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; } *s = sumf; @@ -3130,8 +3130,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += ggml_fp16_to_fp32(x0->m) * y0->s; - summs1 += ggml_fp16_to_fp32(x1->m) * y1->s; + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -3176,10 +3176,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), ggml_fp16_to_fp32(x0->d)*y0->d); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), ggml_fp16_to_fp32(x1->d)*y1->d); + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -3196,8 +3196,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), ggml_fp16_to_fp32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), ggml_fp16_to_fp32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), 
GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -3215,7 +3215,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += ggml_fp16_to_fp32(x0->m) * y0->s; + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3262,7 +3262,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(ggml_fp16_to_fp32(x0->d) * y0->d))); + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -3275,9 +3275,9 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3302,9 +3302,9 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += ggml_fp16_to_fp32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3385,7 +3385,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; @@ -3409,7 +3409,7 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (ggml_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_fp16_to_fp32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; @@ -3451,11 +3451,11 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); @@ -3473,8 +3473,8 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), ggml_fp16_to_fp32(x0->d)*ggml_fp16_to_fp32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, 
vcvtq_f32_s32(vaddq_s32(p2, p3)), ggml_fp16_to_fp32(x1->d)*ggml_fp16_to_fp32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -3486,7 +3486,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(ggml_fp16_to_fp32(x[i].d) * ggml_fp16_to_fp32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); @@ -3517,7 +3517,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)); + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); } *s = sumf; @@ -3532,7 +3532,7 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri sumi += x[i].qs[j]*y[i].qs[j]; } - sumf += sumi*(ggml_fp16_to_fp32(x[i].d)*ggml_fp16_to_fp32(y[i].d)); + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); } *s = sumf; @@ -3562,8 +3562,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3641,8 +3641,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3708,8 +3708,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3816,8 +3816,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); size_t vl = 16; @@ -3903,8 +3903,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ 
-4021,8 +4021,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -4073,8 +4073,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -4188,8 +4188,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); isum[0] = isum[1] = isum[2] = isum[3] = 0; for (int l = 0; l < 16; ++l) { @@ -4242,7 +4242,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const uint8_t * restrict qh = x[i].hmask; @@ -4350,7 +4350,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -4455,7 +4455,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -4676,7 +4676,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d*sum_t; @@ -4741,7 +4741,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -4843,7 +4843,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -4914,7 +4914,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = 
y[i].qs; @@ -5099,7 +5099,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q8 += 8; a += 8; for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -5139,8 +5139,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -5222,8 +5222,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5288,8 +5288,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -5371,8 +5371,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri size_t vl = 8; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -5482,9 +5482,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -5586,8 +5586,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -5632,8 +5632,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = 
GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -5689,8 +5689,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]); + sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); size_t vl = 32; @@ -5739,9 +5739,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); for (int j = 0; j < QK_K/32; ++j) { for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; @@ -5789,8 +5789,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -5878,8 +5878,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; #if QK_K == 256 - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5960,8 +5960,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6065,8 +6065,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -6188,9 +6188,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * 
y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -6288,7 +6288,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -6334,7 +6334,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -6471,7 +6471,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); } - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * restrict sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { @@ -6514,7 +6514,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -6646,7 +6646,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -6726,7 +6726,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -6838,7 +6838,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -6955,7 +6955,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -7053,7 +7053,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -7110,7 +7110,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const 
uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -7269,7 +7269,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/ggml-quants.h b/ggml-quants.h index d88f99e33..70c12c274 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -1,22 +1,12 @@ #pragma once -// This is a private API for quantization and dequantization -// Should not be used directly, use ggml.h instead +#include "ggml-impl.h" -#include "ggml.h" +// GGML internal header #include -#include #include -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - #define QK4_0 32 typedef struct { ggml_fp16_t d; // delta diff --git a/ggml.c b/ggml.c index 95f72c35e..84407b122 100644 --- a/ggml.c +++ b/ggml.c @@ -1,6 +1,6 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-quants.h" #if defined(_MSC_VER) || defined(__MINGW32__) @@ -27,18 +27,6 @@ #include #endif -// static_assert should be a #define, but if it's not, -// fall back to the _Static_assert C11 keyword. -// if C99 - static_assert is noop -// ref: https://stackoverflow.com/a/53923785/4039976 -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -106,23 +94,11 @@ typedef void * thread_ret_t; #include #endif + #ifdef GGML_USE_CPU_HBM #include #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#ifndef __SSE3__ -#define __SSE3__ -#endif -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -248,213 +224,27 @@ inline static void * ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) - // floating point type used to accumulate sums typedef double ggml_float; -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) - -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // __ARM_NEON - // // global data // // precomputed gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_f16[1 << 16]; +static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_quick_f16[1 << 16]; +static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // precomputed silu table for f16 (128 KB) -static ggml_fp16_t table_silu_f16[1 << 16]; +static ggml_fp16_t ggml_table_silu_f16[1 << 16]; // precomputed exp table for f16 (128 KB) -static ggml_fp16_t table_exp_f16[1 << 16]; +static ggml_fp16_t ggml_table_exp_f16[1 << 16]; -// precomputed f32 table for f16 (256 KB) -static float table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. 
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -#endif +// precomputed f32 table for f16 (256 KB) (ggml-impl.h) +float ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API @@ -632,6 +422,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }, + [5] = { // GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", .blck_size = QK5_0, @@ -1551,7 +1363,7 @@ inline static float ggml_gelu_f32(float x) { inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { - y[i] = table_gelu_f16[i16[i]]; + y[i] = ggml_table_gelu_f16[i16[i]]; } } @@ -1561,7 +1373,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } #else @@ -1579,7 +1391,7 @@ inline static float ggml_gelu_quick_f32(float x) { //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_gelu_quick_f16[i16[i]]; +// y[i] = ggml_table_gelu_quick_f16[i16[i]]; // } //} @@ -1589,7 +1401,7 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -1608,7 +1420,7 @@ inline static float ggml_silu_f32(float x) { //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_silu_f16[i16[i]]; +// y[i] = ggml_table_silu_f16[i16[i]]; // } //} @@ -1618,7 +1430,7 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]); } } #else @@ -2334,11 +2146,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { for (int i = 0; i < (1 << 16); ++i) { uint16_t ui = i; memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - table_gelu_quick_f16[i] = 
GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); - table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); + const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); + ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); + ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); @@ -10701,7 +10513,7 @@ static void ggml_compute_forward_soft_max_f32( // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); sum += (ggml_float)val; dp[i] = val; } @@ -12990,7 +12802,7 @@ static void ggml_compute_forward_flash_attn_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (ggml_float)val; SS[j] = val; @@ -13192,7 +13004,7 @@ static void ggml_compute_forward_flash_attn_f16( } else { ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); sump[j] += (ggml_float)val; SS[j] = val; } @@ -13643,7 +13455,7 @@ static void ggml_compute_forward_flash_attn_back_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (ggml_float)val; SW[j] = val; @@ -14393,7 +14205,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); #endif sum += (ggml_float)val; st[i] = val; @@ -14507,7 +14319,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); #endif sum += (ggml_float)val; ds0[i] = val; diff --git a/llama.cpp b/llama.cpp index a4340d527..e599917a8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1467,7 +1467,7 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { } static void llama_kv_cache_clear(struct llama_kv_cache & cache) { - for (int32_t i = 0; i < cache.size; ++i) { + for (int32_t i = 0; i < (int32_t) cache.size; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp index afd7bf77f..753dae911 100644 --- a/tests/test-double-float.cpp +++ b/tests/test-double-float.cpp @@ -4,7 +4,7 @@ #undef NDEBUG #include -#if !defined(__riscv) && !defined(__s390__) +#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON) #include #endif #include diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 884af4054..a2459a286 100644 --- 
a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -129,6 +129,13 @@ int main(int argc, char * argv[]) { ggml_type type = (ggml_type) i; ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + // deprecated - skip + if (qfns.blck_size == 0) { + continue; + } + + printf("Testing %s\n", ggml_type_name((ggml_type) i)); + if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = From 07178c98e1b61a5e2af39d347add12e7eb9e08e1 Mon Sep 17 00:00:00 2001 From: Tungsten842 <886724vf@anonaddy.me> Date: Tue, 31 Oct 2023 18:24:03 +0100 Subject: [PATCH 26/63] flake.nix: fix for rocm 5.7 (#3853) --- flake.lock | 12 ++++++------ flake.nix | 10 ++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/flake.lock b/flake.lock index 070f0e161..0455f6561 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1692799911, - "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", "owner": "numtide", "repo": "flake-utils", - "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1698134075, - "narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=", + "lastModified": 1698318101, + "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4", + "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index fa34394b2..4cf28d5c1 100644 --- a/flake.nix +++ b/flake.nix @@ -11,8 +11,7 @@ meta.mainProgram = "llama"; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; buildInputs = with pkgs; [ openmpi ]; - osSpecific = with pkgs; buildInputs ++ - ( + osSpecific = with pkgs; buildInputs ++ ( if isAarch64 && isDarwin then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate @@ -96,12 +95,15 @@ }; packages.rocm = pkgs.stdenv.mkDerivation { inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; + buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ]; cmakeFlags = cmakeFlags ++ [ "-DLLAMA_HIPBLAS=1" "-DCMAKE_C_COMPILER=hipcc" "-DCMAKE_CXX_COMPILER=hipcc" - "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. + "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" ]; }; apps.llama-server = { From 238657db2364cfb728c694470a4a81702afea760 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:44:49 -0500 Subject: [PATCH 27/63] samplers : Min-P sampler implementation [alternative to Top P/Top K] (#3841) * Introduce the new Min-P sampler by @kalomaze The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. 
The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. * Min-P enabled and set to 0.05 default --------- Co-authored-by: Georgi Gerganov Co-authored-by: cebtenzzre --- common/common.cpp | 8 ++++++++ common/sampling.cpp | 6 ++++-- common/sampling.h | 1 + examples/main/README.md | 8 ++++++++ llama.cpp | 26 ++++++++++++++++++++++++++ llama.h | 7 +++++++ 6 files changed, 54 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c187128d6..dc4865e80 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -218,6 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } sparams.top_p = std::stof(argv[i]); + } else if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.min_p = std::stof(argv[i]); } else if (arg == "--temp") { if (++i >= argc) { invalid_param = true; @@ -679,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); + printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); @@ -1275,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); } diff --git a/common/sampling.cpp b/common/sampling.cpp index c4996c985..673d67a6d 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, - params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, + params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); return std::string(result); @@ -110,6 +110,7 @@ llama_token llama_sampling_sample( const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? 
n_vocab : params.top_k; const float top_p = params.top_p; + const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; @@ -190,6 +191,7 @@ llama_token llama_sampling_sample( llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); llama_sample_temp (ctx_main, &cur_p, temp); id = llama_sample_token(ctx_main, &cur_p); diff --git a/common/sampling.h b/common/sampling.h index 62ea6d4cf..7c9b8dcf2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -14,6 +14,7 @@ typedef struct llama_sampling_params { int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled diff --git a/examples/main/README.md b/examples/main/README.md index a9561c383..a3428b487 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho Example usage: `--top-p 0.95` +### Min P Sampling + +- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05). + +The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. + +Example usage: `--min-p 0.05` + ### Tail Free Sampling (TFS) - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). 
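[editorial note, not part of the patch] For clarity, a minimal standalone sketch of the Min-P rule described in the README hunk above. This is not the llama.cpp API (the real implementation is `llama_sample_min_p` in the llama.cpp diff that follows); it just assumes a probability vector already sorted in descending order and sums to 1:

```c++
#include <cstdio>
#include <vector>

// Keep only tokens whose probability is at least p times the top token's probability,
// always retaining at least min_keep tokens.
static std::vector<float> min_p_filter(std::vector<float> probs, float p, size_t min_keep = 1) {
    const float threshold = p * probs.front(); // scale cutoff by the most likely token
    size_t keep = min_keep;
    for (size_t i = min_keep; i < probs.size(); ++i) {
        if (probs[i] < threshold) {
            break; // probability too small relative to the top token
        }
        keep = i + 1;
    }
    probs.resize(keep);
    return probs;
}

int main() {
    // example from the README: p = 0.05 and a top probability of 0.9 gives a cutoff of 0.045
    std::vector<float> probs = {0.90f, 0.05f, 0.03f, 0.01f, 0.01f};
    for (float q : min_p_filter(probs, 0.05f)) {
        printf("%.3f\n", q); // prints 0.900 and 0.050; 0.030 and below are filtered out
    }
    return 0;
}
```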
diff --git a/llama.cpp b/llama.cpp index e599917a8..7ee589298 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7368,6 +7368,32 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } } +void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p <= 0.0f || !candidates->size) { + return; + } + + llama_sample_softmax(ctx, candidates); + + const int64_t t_start_sample_us = ggml_time_us(); + + float scale = candidates->data[0].p; // scale by max prob + size_t i = 1; // first token always matches + + for (; i < candidates->size; ++i) { + if (candidates->data[i].p < p * scale && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + candidates->size = i; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; diff --git a/llama.h b/llama.h index d727dbd9f..75fe391ef 100644 --- a/llama.h +++ b/llama.h @@ -598,6 +598,13 @@ extern "C" { float p, size_t min_keep); + /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 + LLAMA_API void llama_sample_min_p( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. LLAMA_API void llama_sample_tail_free( struct llama_context * ctx, From 71e3718abdb2771b50c9606d3a7569623a0b0afe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 08:04:02 +0200 Subject: [PATCH 28/63] llama : refactor graph build code (#3837) * llama : factor out ggml-alloc from graph graph build functions ggml-ci * metal : disable kernel load log * llama : factor out tensor offloading outside the build call (wip) ggml-ci * llama : offload rest of the models ggml-ci * llama : update offload log messages to print node index * llama : comments * llama : support offloading result_norm + comments * llama : factor graph input into a function * llama : do tensor offload only with CUDA * llama : fix res_norm offloading * llama : try to optimize offloading code * llama : fix non-CUDA build * llama : try to fix build * llama : move refact in correct place + optimize graph input * llama : refactor tensor offloading as callback * llama : add layer index to all tensor names * llama : add functional header * llama : comment ggml-ci * llama : remove obsolete map for layer counting * llama : add llm_build helper functions (#3848) * llama : add llm_build_norm helper function ggml-ci * llama : add llm_build_ffn helper function (#3849) ggml-ci * llama : add llm_build_k_shift helper ggml-ci * llama : fix offloading after recent changes * llama : add llm_build_kv_store helper ggml-ci * llama : remove obsolete offload names * llama : fix llm_build_k_shift to use n_head_kv instead of n_head * llama : simplify falcon Q, K, V computation * llama : remove obsolete comments in build graphs * llama : add llm_build_kqv helper ggml-ci * llama : minor * llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading * llama : fix input allocation logic * llama : update offload functions for KQ tensors * llama : normalize tensor names ggml-ci * llama : enable warning about not offloaded tensors * llama : remove extra ; + deduplicate gate_b logic * llama : add llm_build_inp_embd helper 
--- ggml-metal.m | 11 +- ggml.h | 2 +- llama.cpp | 3655 ++++++++++++++++++++------------------------------ 3 files changed, 1477 insertions(+), 2191 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 2380c4310..bc881395a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { // load kernels { NSError * error = nil; -#define GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + + /* GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ + */ +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } diff --git a/ggml.h b/ggml.h index 8c954904e..9d16c5a72 100644 --- a/ggml.h +++ b/ggml.h @@ -709,7 +709,7 @@ extern "C" { // Context tensor enumeration and lookup GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); diff --git a/llama.cpp b/llama.cpp index 7ee589298..ead1d421d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -60,7 +60,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -69,11 +71,10 @@ #include #include #include +#include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -969,7 +970,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } @@ -1113,13 +1114,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 // ff bias - struct ggml_tensor * b2; // ffn_down - struct ggml_tensor * b3; // ffn_up + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -1225,8 +1226,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; - struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -2482,7 +2483,7 @@ 
static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2536,21 +2537,21 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; ggml_backend_type backend_output; @@ -2602,15 +2603,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2618,7 +2619,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + 
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2681,21 +2682,21 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2754,11 +2755,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2766,14 +2767,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); } } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; @@ -2814,31 +2815,31 @@ static void llm_load_tensors( 
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = 
ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); // output { @@ -2897,11 +2898,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2909,14 +2910,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); } } } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2967,8 +2968,8 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2976,8 +2977,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w2) + - ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; @@ -3007,10 +3008,10 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif defined(GGML_USE_CLBLAST) + const int max_offloadable_layers = 
hparams.n_layer + 3; +#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); @@ -3089,9 +3090,359 @@ static bool llama_model_load( return true; } +using llm_build_cb = std::function; + +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + int64_t n_embd, + int32_t n_tokens, + const llm_build_cb & cb) { + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens", -1); + + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + } + + return inpL; +} + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, 
"v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + float eps, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx, cur, mb); + } + + return cur; +} + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + } + + return cur; +} + +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int32_t n_tokens, + int32_t n_kv, + float alibi_bias_max, + const 
llm_build_cb & cb, + int il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (alibi_bias_max > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max); + cb(kq, "kq_scaled_alibi", il); + } + + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); + + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); + + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3106,7 +3457,6 @@ static struct ggml_cgraph * llm_build_llama( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3114,13 +3464,11 @@ static struct ggml_cgraph * llm_build_llama( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("n_kv = %d\n", n_kv); @@ -3139,314 +3487,81 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - 
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, 
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
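Both the removed KQ / KQ_scaled / KQ_masked / KQ_soft_max / KQV chain and the llm_build_kqv helper that replaces it express plain masked scaled dot-product attention. As a sanity reference only, the same computation for a single head in scalar C++ (toy row-major vectors standing in for the cached K/V, no ggml):

```c++
#include <algorithm>
#include <cmath>
#include <vector>

// One attention head, the scalar way: scores = (K*Q)/sqrt(d) + mask, soft_max over the cache,
// then a weighted sum of the cached V rows -- what KQ_scaled/KQ_masked/KQ_soft_max/KQV build as a graph.
static std::vector<float> attn_head(
        const std::vector<float> & Q,    // n_tokens x d
        const std::vector<float> & K,    // n_kv x d
        const std::vector<float> & V,    // n_kv x d
        const std::vector<float> & mask, // n_tokens x n_kv (0 or -INF)
        int n_tokens, int n_kv, int d) {
    std::vector<float> out(n_tokens*d, 0.0f);
    std::vector<float> score(n_kv);

    for (int t = 0; t < n_tokens; ++t) {
        // scaled, masked scores for token t against every cache entry
        float max_s = -INFINITY;
        for (int i = 0; i < n_kv; ++i) {
            float s = 0.0f;
            for (int c = 0; c < d; ++c) {
                s += Q[t*d + c]*K[i*d + c];
            }
            s = s/std::sqrt((float) d) + mask[t*n_kv + i];
            score[i] = s;
            max_s = std::max(max_s, s);
        }
        // soft_max (shift by the max for numerical stability)
        float sum = 0.0f;
        for (int i = 0; i < n_kv; ++i) {
            score[i] = std::exp(score[i] - max_s);
            sum += score[i];
        }
        // weighted sum of the V rows
        for (int i = 0; i < n_kv; ++i) {
            const float w = score[i]/sum;
            for (int c = 0; c < d; ++c) {
                out[t*d + c] += w*V[i*d + c];
            }
        }
    }
    return out;
}
```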
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3454,21 +3569,14 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3479,7 +3587,9 @@ static struct ggml_cgraph * llm_build_llama( static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3494,7 +3604,6 @@ static 
struct ggml_cgraph * llm_build_baichaun( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3502,13 +3611,11 @@ static struct ggml_cgraph * llm_build_baichaun( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; @@ -3525,331 +3632,91 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for 
(int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = 
ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: GGML_ASSERT(false); } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + // apply ALiBi for 13B model + const float alibi_bias_max = model.type == MODEL_13B ? 8.0f : -1.0f; - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - break; - case MODEL_13B: - // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - break; - default: - GGML_ASSERT(false); - } - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, 
"V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3857,366 +3724,14 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t 
n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - // printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = 
ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * inpSA = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } - - // self-attention - { - // compute Q and K - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - 
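The Baichuan-13B path and the removed Refact builder bias the attention scores with ALiBi (ggml_alibi, maximum bias 8) instead of rotating Q and K: each head gets a geometric slope and every score is penalised in proportion to how far in the past the key is. A sketch of the textbook formulation for a power-of-two head count; the exact slope and sign conventions inside ggml_alibi may differ, so treat this as illustration only:

```c++
#include <cmath>
#include <vector>

// ALiBi, the textbook way: head h (0-based, n_head a power of two) gets slope
//   m_h = 2^(-bias_max * (h + 1) / n_head)
// and score[q][k] is shifted by -m_h * distance for keys in the past.
static void apply_alibi(std::vector<float> & scores, // n_tokens x n_kv, one head
                        int n_tokens, int n_kv,
                        int head, int n_head,
                        float bias_max /* 8.0f above */) {
    const float slope = std::pow(2.0f, -bias_max*(float)(head + 1)/(float) n_head);

    for (int q = 0; q < n_tokens; ++q) {
        for (int k = 0; k < n_kv; ++k) {
            const int dist = q - k; // how far in the past the key is
            if (dist > 0) {
                scores[q*n_kv + k] -= slope*(float) dist;
            }
        }
    }
}
```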
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4227,7 +3742,9 @@ static struct ggml_cgraph * llm_build_refact( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4250,13 +3767,11 @@ static struct ggml_cgraph * llm_build_falcon( const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? 
n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); @@ -4276,294 +3791,94 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if 
(!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); - - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); - } else { // Falcon 7B + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, attn_norm, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm_2", il); + } else { cur = attn_norm; } - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv", il); - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
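Falcon projects Q, K and V with one fused wqkv matmul, and the comment above is about carving the result back out with offset/strided views; the replacement right below does it with plain ggml_view_2d offsets into each fused row. The layout arithmetic for a single token, as a stand-alone sketch (hypothetical float rows laid out as [n_embd | n_embd_gqa | n_embd_gqa]):

```c++
#include <cstddef>

// One token's fused QKV row: the first n_embd floats are Q, the next n_embd_gqa are K and the
// last n_embd_gqa are V -- the same offsets the ggml_view_2d calls below use
// (0, n_embd, n_embd + n_embd_gqa, each scaled by sizeof(float)).
struct qkv_spans {
    const float * q;
    const float * k;
    const float * v;
};

static qkv_spans split_qkv_row(const float * fused_row, size_t n_embd, size_t n_embd_gqa) {
    qkv_spans s;
    s.q = fused_row;                       // byte offset 0
    s.k = fused_row + n_embd;              // byte offset sizeof(float)*n_embd
    s.v = fused_row + n_embd + n_embd_gqa; // byte offset sizeof(float)*(n_embd + n_embd_gqa)
    return s;
}
```

For the whole batch the views below pass cur->nb[1] as the row stride, so the same per-row offsets apply to every token.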
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - const size_t wsize = ggml_type_size(cur->type); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - offload_func_kq(tmpq); - - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); + Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); + Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * 
KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cur = llm_build_kqv(lctx, ctx0, attn_norm, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = cur; // feed forward { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); - - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4572,18 +3887,14 @@ static struct ggml_cgraph * llm_build_falcon( cur = inpL; // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - ggml_set_name(cur, "result_norm"); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4594,7 +3905,9 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llm_build_starcoder( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4607,7 +3920,6 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4615,11 +3927,9 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = 
ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -4634,266 +3944,95 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; - struct ggml_tensor * position; + struct ggml_tensor * pos; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - { - // Compute position embeddings. 
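Unlike the rotary models above, StarCoder uses learned absolute position embeddings: the code that follows (both the removed explicit version and the new ggml_get_rows on model.pos_embd) looks up one embedding row per position and adds it to the token embedding before the first layer. A minimal sketch of that lookup-and-add, with plain float tables standing in for the ggml tensors (hypothetical helper, not llama.cpp code):

```c++
#include <cstddef>
#include <vector>

// inpL = tok_embd[token] + pos_embd[position], row by row -- the same thing the
// ggml_get_rows + ggml_add pair below builds into the graph.
static std::vector<float> embed_tokens(
        const std::vector<float> & tok_embd, // n_vocab x n_embd
        const std::vector<float> & pos_embd, // n_ctx_train x n_embd
        const std::vector<int> & tokens,
        const std::vector<int> & positions,
        int n_embd) {
    std::vector<float> out(tokens.size()*n_embd);
    for (size_t i = 0; i < tokens.size(); ++i) {
        for (int c = 0; c < n_embd; ++c) {
            out[i*n_embd + c] = tok_embd[(size_t) tokens[i]*n_embd + c]
                              + pos_embd[(size_t) positions[i]*n_embd + c];
        }
    }
    return out;
}
```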
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(lctx.alloc, inp_positions); - if (!ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < n_tokens; ++i) { - ((int32_t *) inp_positions->data)[i] = batch.pos[i]; - } - } - ggml_set_name(inp_positions, "inp_positions"); - - position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); - } + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - offload_func_kq(KQ_mask); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); + cb(KQ_mask, "KQ_mask", -1); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - inpL = ggml_add(ctx0, token, position); - ggml_set_name(inpL, "inpL"); + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); + // self-attention { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - offload_func(cur); - } - - { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - ggml_set_name(tmpq, "tmpq"); - ggml_set_name(tmpk, 
"tmpk"); - ggml_set_name(tmpv, "tmpv"); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - offload_func_kq(tmpq); - offload_func_kq(tmpk); - offload_func_v (tmpv); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - offload_func(cur); - - // Add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - - struct ggml_tensor * inpFF = cur; + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, 
norm_eps); - offload_func_nr(cur); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); - offload_func_nr(cur); - } - - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - offload_func(cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); - offload_func(cur); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); - + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func_nr(cur); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - ggml_set_name(cur, "result_norm"); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); @@ -4903,7 +4042,9 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4912,29 +4053,27 @@ static struct ggml_cgraph * llm_build_persimmon( GGML_ASSERT(!!kv_self.ctx); const auto & cparams = lctx.cparams; + const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; + const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int n_gpu_layers = model.n_gpu_layers; - + const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, @@ -4948,146 +4087,77 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "imp_embd", -1); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + cb(KQ_scale, "KQ_scale", -1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); + cb(KQ_mask, "KQ_mask", -1); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. 
- ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb); } - for (int il=0; il < n_layer; ++il) { + + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; - { - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - offload_func(cur); - ggml_format_name(cur, "input_layernorm_%d", il); - } + + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); + // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv", il); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + cb(cur, "bqkv", il); // split qkv GGML_ASSERT(n_head_kv == n_head); - ggml_set_name(cur, format("qkv_%d", il).c_str()); + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - offload_func_kq(tmpqkv); + cb(tmpqkv, "tmpqkv", il); + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - offload_func_kq(tmpqkv_perm); - ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); + cb(tmpqkv_perm, "tmpqkv", il); + struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 ); - offload_func_kq(tmpq); + cb(tmpq, "tmpq", il); + struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - offload_func_kq(tmpk); - // Q/K Layernorm - tmpq = ggml_norm(ctx0, tmpq, norm_eps); - offload_func_kq(tmpq); - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - offload_func_kq(tmpq); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - offload_func_kq(tmpq); + cb(tmpk, "tmpk", il); - tmpk = ggml_norm(ctx0, tmpk, norm_eps); - offload_func_v(tmpk); - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - offload_func_v(tmpk); - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - offload_func_v(tmpk); + // Q/K Layernorm + tmpq = llm_build_norm(ctx0, tmpq, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(tmpq, "tmpq", il); + + tmpk = llm_build_norm(ctx0, tmpk, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(tmpk, "tmpk", il); // RoPE the first n_rot of q/k, pass the other half, and concat. 
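
The comment above refers to partial rotary embeddings: only the first n_rot dimensions of each head are rotated, while the remaining dimensions pass through untouched and are concatenated back afterwards. A minimal standalone sketch of that idea follows; the adjacent-pair indexing and the helper name are illustrative only and do not reproduce ggml's exact layout (the Persimmon graph uses the NeoX-style rope, see the LLM_ROPE_NEOX shift above).

```c++
// Partial RoPE on a single head vector: rotate the first n_rot dims, pass the rest.
// Toy adjacent-pair convention; assumes n_rot is even and n_rot <= head.size().
#include <cmath>
#include <vector>

std::vector<float> partial_rope(const std::vector<float> & head, int n_rot, int pos, float freq_base = 10000.0f) {
    std::vector<float> out = head; // dims [n_rot, head.size()) pass through unchanged
    for (int i = 0; i < n_rot; i += 2) {
        const float theta = pos * std::pow(freq_base, -float(i) / float(n_rot));
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        out[i + 0] = head[i + 0]*c - head[i + 1]*s;
        out[i + 1] = head[i + 0]*s + head[i + 1]*c;
    }
    return out;
}
```

The qrot/qpass and krot/kpass views in the hunk below carve out exactly these two ranges so that only the first part is fed through ggml_rope_custom.
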
struct ggml_tensor * qrot = ggml_view_3d( @@ -5096,16 +4166,15 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, 0 ); - offload_func_kq(qrot); - ggml_format_name(qrot, "qrot_%d", il); + cb(qrot, "qrot", il); + struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 ); - offload_func_kq(krot); - ggml_format_name(krot, "krot_%d", il); + cb(krot, "krot", il); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( @@ -5114,193 +4183,117 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot ); - offload_func_kq(qpass); - ggml_format_name(qpass, "qpass_%d", il); + cb(qpass, "qpass", il); + struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot ); - offload_func_kq(kpass); - ggml_format_name(kpass, "kpass_%d", il); + cb(kpass, "kpass", il); - struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(qrotated); + cb(qrotated, "qrotated", il); + struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(krotated); + cb(krotated, "krotated", il); + // ggml currently only supports concatenation on dim=2 // so we need to permute qrot, qpass, concat, then permute back. 
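
As the comment above notes, ggml currently only supports concatenation along dim 2, so the rotated and pass-through parts are permuted until the head dimension lands on dim 2, concatenated, and permuted back in the code that follows. A hedged sketch of that workaround using the same ggml calls; the helper name is made up here and the shape comments follow the apparent ne[] order without being verified against ggml internals:

```c++
#include "ggml.h"

// a: [n_rot,               n_head, n_tokens]
// b: [n_embd_head - n_rot, n_head, n_tokens]
// result: [n_embd_head, n_head, n_tokens], joined along dim 0
static struct ggml_tensor * concat_dim0(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    // move dim 0 into position 2 (ggml_permute(..., 2, 1, 0, 3) is its own inverse)
    struct ggml_tensor * ap = ggml_cont(ctx, ggml_permute(ctx, a, 2, 1, 0, 3));
    struct ggml_tensor * bp = ggml_cont(ctx, ggml_permute(ctx, b, 2, 1, 0, 3));
    // ggml_concat joins along dim 2
    struct ggml_tensor * c = ggml_concat(ctx, ap, bp);
    // move the joined dimension back into position 0
    return ggml_cont(ctx, ggml_permute(ctx, c, 2, 1, 0, 3));
}
```
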
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - offload_func_kq(qrotated); + cb(qrotated, "qrotated", il); + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - offload_func_kq(krotated); + cb(krotated, "krotated", il); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - offload_func_kq(qpass); + cb(qpass, "qpass", il); + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - offload_func_kq(kpass); + cb(kpass, "kpass", il); struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - offload_func_kq(Qcur); + cb(Qcur, "Qcur", il); + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - offload_func_kq(Kcur); + cb(Kcur, "Kcur", il); struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - offload_func_kq(Q); + cb(Q, "Q", il); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - offload_func_kq(Kcur); - { - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - offload_func_v(tmpv); - // store K, V in cache - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - offload_func_kq(k); - ggml_set_name(k, "k"); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! 
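
The "important" comment above is the invariant behind both the hand-rolled cache writes being removed here and the llm_build_kv_store() call that replaces them: the K rows copied into kv_self are the already-rotated ones, so later reads never re-apply RoPE. As a rough mental model only (plain C++, hypothetical names, flat row-major layout, ignoring that the real cache keeps V transposed):

```c++
#include <cstring>
#include <vector>

// Toy KV cache: per layer, n_ctx slots of n_embd_gqa floats for K and for V.
struct toy_kv_cache {
    int n_ctx;
    int n_embd_gqa;
    std::vector<float> k; // n_layer * n_ctx * n_embd_gqa
    std::vector<float> v; // same size (the real cache stores V transposed)
};

// Append n_tokens freshly computed K/V rows for layer il, starting at slot kv_head.
void kv_store(toy_kv_cache & cache, int il, int kv_head, int n_tokens,
              const float * Kcur, const float * Vcur) {
    const size_t row  = cache.n_embd_gqa;
    const size_t base = ((size_t) il * cache.n_ctx + kv_head) * row;
    std::memcpy(cache.k.data() + base, Kcur, n_tokens * row * sizeof(float));
    std::memcpy(cache.v.data() + base, Vcur, n_tokens * row * sizeof(float));
}
```

In the graph itself these writes are ggml_cpy nodes expanded into the forward graph, so they run together with the rest of the layer.
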
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - - offload_func_kq(K); - ggml_format_name(K, "K_%d", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_kq(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].bo); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + // TODO: not tested, could be broken + cur = llm_build_kqv(lctx, ctx0, Q, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network { - // MLP - { - // Norm - cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func(cur); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - ggml_set_name(cur, "ffn_norm"); - offload_func(cur); - } - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, model.layers[il].b3); - offload_func(cur); - ggml_set_name(cur, "result_ffn_up"); - - cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); - ggml_set_name(cur, "result_ffn_act"); - offload_func(cur); - offload_func(cur->src[0]); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - cur = ggml_add(ctx0, - cur, - model.layers[il].b2); - offload_func(cur); - ggml_set_name(cur, "outFF"); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_outFF"); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, 
"l_out", il); + inpL = cur; } + cur = inpL; - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - cur = ggml_mul(ctx0, cur, model.output_norm); - offload_func_nr(cur); - cur = ggml_add(ctx0, cur, model.output_norm_b); - // offload_func_nr(cur); + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); - ggml_set_name(cur, "result_norm"); - } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + return gf; } -static struct ggml_cgraph * llm_build_bloom( +static struct ggml_cgraph * llm_build_refact( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5315,6 +4308,133 @@ static struct ggml_cgraph * llm_build_bloom( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); + + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + 
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llm_build_bloom( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5322,8 +4442,8 @@ static struct ggml_cgraph * llm_build_bloom( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5340,198 +4460,90 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); + cb(KQ_mask, "KQ_mask", -1); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 
0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // norm - { - inpL = ggml_norm(ctx0, token, norm_eps); - inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); - } - - ggml_set_name(inpL, "inpL"); + inpL = llm_build_norm(ctx0, inpL, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - { - // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); - - struct ggml_tensor * Qcur = tmpq; - struct ggml_tensor * Kcur = tmpk; - - // store key and value to memory - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = 
ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); - } - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - // Add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); - } + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - } - ggml_set_name(cur, "result_norm"); + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5542,7 +4554,9 @@ static struct ggml_cgraph * llm_build_bloom( static struct ggml_cgraph * llm_build_mpt( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5555,7 +4569,6 @@ static struct ggml_cgraph * llm_build_mpt( 
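
Both the Bloom graph above and the MPT graph below fold the previous ggml_alibi()-based path into llm_build_kqv() by passing a maximum ALiBi bias (8.0f for Bloom and Refact, hparams.f_max_alibi_bias for MPT). For orientation, the per-head slopes implied by such a bias follow the standard ALiBi scheme; the sketch below assumes a power-of-two head count and is not a literal extract from ggml:

```c++
#include <cmath>
#include <vector>

// Per-head ALiBi slopes for n_head heads and a maximum bias max_bias
// (8.0f in the Bloom graph above). Head h then adds slopes[h] * (j - i)
// to the raw score of query position i attending to key position j (j <= i),
// i.e. more distant keys are penalized linearly before the softmax.
std::vector<float> alibi_slopes(int n_head, float max_bias) {
    std::vector<float> slopes(n_head);
    for (int h = 0; h < n_head; ++h) {
        slopes[h] = std::pow(2.0f, -max_bias * float(h + 1) / float(n_head));
    }
    return slopes;
}
```

Since softmax is shift-invariant within a row, this linear penalty is equivalent to the additive KQ_scaled_alibi term that the removed code computed explicitly before applying KQ_mask.
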
const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -5563,11 +4576,9 @@ static struct ggml_cgraph * llm_build_mpt( const float clamp_kqv = hparams.f_clamp_kqv; const float max_alibi_bias = hparams.f_max_alibi_bias; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5586,269 +4597,93 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - //int warmup = 0; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; - } - - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || 
kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - offload_func(attn_norm); - - if (1) { - cur = attn_norm; - } - - // compute QKV + cur = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv", il); if (clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - offload_func_kq(cur); + cb(cur, "wqkv_clamped", il); } - const size_t wsize = ggml_type_size(cur->type); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); - offload_func_kq(Qcur); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); - offload_func_kq(Kcur); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_kq(Kcur); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - 
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - offload_func_kq(KQ_scaled_alibi); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } // Add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // feed forward { - // Norm - { - cur = ggml_norm(ctx0, attn_out, norm_eps); - offload_func(cur); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); - - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + // input for next layer inpL = cur; } cur = inpL; - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - - cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5857,50 +4692,494 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } +// +// tensor offloading helpers +// +// TODO: will be removed with backend v2 + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + 
OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +// TODO: will be removed with backend v2 +struct llm_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } + } + } + + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + + llm_offload_trie() { + root = new node; + } + + llm_offload_trie(const std::unordered_map & map) { + root = new node; + + for (const auto & kv : map) { + add(kv.first, kv.second); + } + } + + ~llm_offload_trie() { + delete root; + } + + void add(const char * name, llm_offload_func_e func) { + node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + cur->children[c] = new node; + } + + cur = cur->children[c]; + } + + cur->func = func; + } + + llm_offload_func_e find(const char * name) const { + const node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } + + cur = cur->children[c]; + } + + return cur->func; + } + + node * root = nullptr; +}; + +// TODO: will be removed with backend v2 +static const std::unordered_map k_offload_map = { + //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + { "pos_embd", OFFLOAD_FUNC_NR }, + + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, + + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, + + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + + { "attn_norm", OFFLOAD_FUNC }, + { "attn_norm_2", OFFLOAD_FUNC }, + + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, + + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, + + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + + { "ffn_inp", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + + { "l_out", OFFLOAD_FUNC }, + + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; + +static llm_offload_trie 
k_offload_func_trie(k_offload_map); + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; + +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif + + int n_non_view = 0; // number of non-view tensors that have been processed by the callback + + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) + // TODO: will be removed with backend v2 + llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + // + // allocate input tensors and set input data + // + // TODO: will be removed with backend v2 + + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + + alloc_inp_tokens = true; + } + + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + + alloc_inp_embd = true; + } + + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + alloc_inp_pos = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + + alloc_inp_KQ_scale = true; + } + + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + alloc_inp_KQ_mask = true; + } + + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + + 
alloc_inp_K_shift = true; + } + + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; + } + + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } + + // + // offload layers + // + // TODO: will be removed with backend v2 + +//#define LLAMA_OFFLOAD_DEBUG + + if (!do_offload) { + return; + } + + const int n_layer = model.hparams.n_layer; + + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; + + // should we offload the final norm? yes if we are not computing embeddings + const bool offload_emb = lctx.embedding.empty(); + + static const std::unordered_map> k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, +#ifdef GGML_USE_CUBLAS + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, +#endif // GGML_USE_CUBLAS + }; + + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(name); + + if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + } +#endif + + return; + } + + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } + + offload_func_t func = ggml_offload_nop; + + // this is needed for compatibility with Metal for example +#ifdef GGML_USE_CUBLAS + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t ggml_offload_gpu = ggml_offload_nop; +#endif + + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; + default: GGML_ASSERT(false); + } + + // apply offload function to the tensor + func(cur); + +#ifdef LLAMA_OFFLOAD_DEBUG + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } +#endif + }; + struct ggml_cgraph * result = NULL; switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch); + result = llm_build_llama(lctx, batch, cb, worst_case); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch); + result = llm_build_baichaun(lctx, batch, cb, worst_case); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch); + result = llm_build_falcon(lctx, batch, cb, worst_case); } 
break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, batch); + result = llm_build_starcoder(lctx, batch, cb, worst_case); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch); + result = llm_build_persimmon(lctx, batch, cb, worst_case); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch); + result = llm_build_refact(lctx, batch, cb, worst_case); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch); + result = llm_build_bloom(lctx, batch, cb, worst_case); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch); + result = llm_build_mpt(lctx, batch, cb, worst_case); } break; default: GGML_ASSERT(false); } + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } + } + return result; } @@ -6043,11 +5322,13 @@ static int llama_decode_internal( } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || model.arch == LLM_ARCH_MPT; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; @@ -6102,6 +5383,8 @@ static int llama_decode_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; @@ -8713,8 +7996,8 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_force_inplace = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { From ca190bca8e844d171020d6147687e71472d71734 Mon Sep 17 00:00:00 2001 From: Adrian Hesketh Date: Wed, 1 Nov 2023 09:28:28 +0000 Subject: [PATCH 29/63] server : re-enable completion and embedded at the same time (#3876) --- .gitignore | 1 + examples/server/server.cpp | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 545c28726..5d7c5479e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ .DS_Store .build/ .cache/ +.ccls-cache/ .direnv/ .envrc .swiftpm diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index c163c7f8e..47ae0d558 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -149,6 +149,7 @@ struct task_server { task_type type; json data; bool infill_mode = false; + bool embedding_mode = false; }; struct task_result { @@ -371,6 +372,7 @@ struct llama_client_slot std::vector generated_token_probs; bool infill = false; + bool embedding = false; bool has_next_token = true; bool truncated = false; bool stopped_eos = false; @@ -1244,13 +1246,14 @@ struct llama_server_context queue_results.push_back(res); } - int request_completion(json data, bool infill) + int request_completion(json data, bool infill, bool embedding) { std::lock_guard lock(mutex_tasks); task_server task; task.id = id_gen++; task.data = data; task.infill_mode = infill; + task.embedding_mode = embedding; task.type = COMPLETION_TASK; queue_tasks.push_back(task); return task.id; @@ -1376,7 +1379,7 @@ struct llama_server_context { LOG_TEE("slot unavailable\n"); // send error result - send_error(task.id, "slot unavaliable"); + send_error(task.id, "slot unavailable"); return; } @@ -1388,6 +1391,7 @@ struct llama_server_context slot->reset(); slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; slot->task_id = task.id; if (!launch_slot_with_data(slot, task.data)) @@ -1695,7 +1699,7 @@ struct llama_server_context } // prompt evaluated for embedding - if (params.embedding) + if (slot.embedding) { send_embedding(slot); slot.release(); @@ -2274,7 +2278,7 @@ int main(int argc, char **argv) svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); - const int task_id = llama.request_completion(data, false); + const int task_id = llama.request_completion(data, false, false); if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -2329,7 +2333,7 @@ int main(int argc, char **argv) svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); - const int task_id = llama.request_completion(data, true); + const int task_id = llama.request_completion(data, true, false); if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -2433,7 +2437,7 @@ int main(int argc, char **argv) { prompt = ""; } - const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false); + const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true); task_result result = llama.next_result(task_id); return res.set_content(result.result_json.dump(), "application/json"); }); From f0e209324a7f663225791897877bf610f1af152d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 11:29:07 +0200 Subject: [PATCH 30/63] scripts : add server-llm.sh (#3868) * scripts : add deploy-server.sh * scripts : rename to server-llm.sh * scripts : working curl pipe --- scripts/server-llm.sh | 391 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 scripts/server-llm.sh diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh new file mode 100644 index 000000000..7bf0929bb --- /dev/null +++ b/scripts/server-llm.sh @@ -0,0 +1,391 @@ +#!/bin/bash +# +# Helper script for deploying llama.cpp server with a single Bash command +# +# - Works on Linux and macOS +# - Supports: CPU, CUDA, Metal, OpenCL +# - Can run all GGUF models from HuggingFace +# - 
Can serve requests in parallel +# - Always builds latest llama.cpp from GitHub +# +# Limitations +# +# - Chat templates are poorly supported (base models recommended) +# - Might be unstable! +# +# Usage: +# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] +# +# --port: port number, default is 8888 +# --repo: path to a repo containing GGUF model files +# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input +# --backend: cpu, cuda, metal, opencl, depends on the OS +# --gpu-id: gpu id, default is 0 +# --n-parallel: number of parallel requests, default is 8 +# --n-kv: KV cache size, default is 4096 +# --verbose: verbose output +# +# Example: +# +# bash -c "$(curl -s https://ggml.ai/server-llm.sh)" +# + +set -e + +# required utils: curl, git, make +if ! command -v curl &> /dev/null; then + printf "[-] curl not found\n" + exit 1 +fi +if ! command -v git &> /dev/null; then + printf "[-] git not found\n" + exit 1 +fi +if ! command -v make &> /dev/null; then + printf "[-] make not found\n" + exit 1 +fi + +# parse arguments +port=8888 +repo="" +wtype="" +backend="cpu" + +# if macOS, use metal backend by default +if [[ "$OSTYPE" == "darwin"* ]]; then + backend="metal" +elif command -v nvcc &> /dev/null; then + backend="cuda" +fi + +gpu_id=0 +n_parallel=8 +n_kv=4096 +verbose=0 + +function print_usage { + printf "Usage:\n" + printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" + printf " --port: port number, default is 8888\n" + printf " --repo: path to a repo containing GGUF model files\n" + printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" + printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --gpu-id: gpu id, default is 0\n" + printf " --n-parallel: number of parallel requests, default is 8\n" + printf " --n-kv: KV cache size, default is 4096\n" + printf " --verbose: verbose output\n\n" + printf "Example:\n\n" + printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' +} + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --port) + port="$2" + shift + shift + ;; + --repo) + repo="$2" + shift + shift + ;; + --wtype) + wtype="$2" + shift + shift + ;; + --backend) + backend="$2" + shift + shift + ;; + --gpu-id) + gpu_id="$2" + shift + shift + ;; + --n-parallel) + n_parallel="$2" + shift + shift + ;; + --n-kv) + n_kv="$2" + shift + shift + ;; + --verbose) + verbose=1 + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + echo "Unknown argument: $key" + print_usage + exit 1 + ;; + esac +done + +# available weights types +wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") + +wfiles=() +for wt in "${wtypes[@]}"; do + wfiles+=("") +done + +# sample repos +repos=( + "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" + "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" + "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" + "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" + "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" +) + +printf "\n" +printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" +printf " Based 
on the options that follow, the script might download a model file\n" +printf " from the internet, which can be a few GBs in size. The script will also\n" +printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" +printf "\n" +printf " Upon success, an HTTP server will be started and it will serve the selected\n" +printf " model using llama.cpp for demonstration purposes.\n" +printf "\n" +printf " Please note:\n" +printf "\n" +printf " - All new data will be stored in the current folder\n" +printf " - The server will be listening on all network interfaces\n" +printf " - The server will run with default settings which are not always optimal\n" +printf " - Do not judge the quality of a model based on the results from this script\n" +printf " - Do not use this script to benchmark llama.cpp\n" +printf " - Do not use this script in production\n" +printf " - This script is only for demonstration purposes\n" +printf "\n" +printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" +printf "\n" +printf " Press Enter to continue ...\n\n" + +read + +if [[ -z "$repo" ]]; then + printf "[+] No repo provided from the command line\n" + printf " Please select a number from the list below or enter an URL:\n\n" + + is=0 + for r in "${repos[@]}"; do + printf " %2d) %s\n" $is "$r" + is=$((is+1)) + done + + # ask for repo until index of sample repo is provided or an URL + while [[ -z "$repo" ]]; do + printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" + read -p "[+] Select repo: " repo + + # check if the input is a number + if [[ "$repo" =~ ^[0-9]+$ ]]; then + if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then + repo="${repos[$repo]}" + else + printf "[-] Invalid repo index: %s\n" "$repo" + repo="" + fi + elif [[ "$repo" =~ ^https?:// ]]; then + repo="$repo" + else + printf "[-] Invalid repo URL: %s\n" "$repo" + repo="" + fi + done +fi + +# remove suffix +repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') + +printf "[+] Checking for GGUF model files in %s\n" "$repo" + +# find GGUF files in the source +# TODO: better logic +model_tree="${repo%/}/tree/main" +model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*(.*)<\/span><\/a>/\1/g') + +# list all files in the provided git repo +printf "[+] Model files:\n\n" +for file in $model_files; do + # determine iw by grepping the filename with wtypes + iw=-1 + is=0 + for wt in "${wtypes[@]}"; do + # uppercase + ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') + if [[ "$ufile" =~ "$wt" ]]; then + iw=$is + break + fi + is=$((is+1)) + done + + if [[ $iw -eq -1 ]]; then + continue + fi + + wfiles[$iw]="$file" + + have=" " + if [[ -f "$file" ]]; then + have="*" + fi + + printf " %2d) %s %s\n" $iw "$have" "$file" +done + +# ask for weights type until provided and available +while [[ -z "$wtype" ]]; do + printf "\n" + read -p "[+] Select weight type: " wtype + wfile="${wfiles[$wtype]}" + + if [[ -z "$wfile" ]]; then + printf "[-] Invalid weight type: %s\n" "$wtype" + wtype="" + fi +done + +printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" + +url="${repo%/}/resolve/main/$wfile" + +# check file if the model has been downloaded before +chk="$wfile.chk" + +# check if we should download the file +# - if $wfile does not exist +# - if $wfile exists but $chk does not exist +# - if $wfile exists and $chk exists but $wfile is newer than $chk +# TODO: better logic using git lfs info + +do_download=0 + +if [[ ! -f "$wfile" ]]; then + do_download=1 +elif [[ ! 
-f "$chk" ]]; then + do_download=1 +elif [[ "$wfile" -nt "$chk" ]]; then + do_download=1 +fi + +if [[ $do_download -eq 1 ]]; then + printf "[+] Downloading weights from %s\n" "$url" + + # download the weights file + curl -o "$wfile" -# -L "$url" + + # create a check file if successful + if [[ $? -eq 0 ]]; then + printf "[+] Creating check file %s\n" "$chk" + touch "$chk" + fi +else + printf "[+] Using cached weights %s\n" "$wfile" +fi + +# get latest llama.cpp and build + +printf "[+] Downloading latest llama.cpp\n" + +llama_cpp_dir="__llama_cpp_port_${port}__" + +if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then + # if the dir exists and there isn't a file "__ggml_script__" in it, abort + printf "[-] Directory %s already exists\n" "$llama_cpp_dir" + printf "[-] Please remove it and try again\n" + exit 1 +elif [[ -d "$llama_cpp_dir" ]]; then + printf "[+] Directory %s already exists\n" "$llama_cpp_dir" + printf "[+] Using cached llama.cpp\n" + + cd "$llama_cpp_dir" + git reset --hard + git fetch + git checkout origin/master + + cd .. +else + printf "[+] Cloning llama.cpp\n" + + git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" +fi + +# mark that that the directory is made by this script +touch "$llama_cpp_dir/__ggml_script__" + +if [[ $verbose -eq 1 ]]; then + set -x +fi + +# build +cd "$llama_cpp_dir" + +make clean + +log="--silent" +if [[ $verbose -eq 1 ]]; then + log="" +fi + +if [[ "$backend" == "cuda" ]]; then + printf "[+] Building with CUDA backend\n" + LLAMA_CUBLAS=1 make -j server $log +elif [[ "$backend" == "cpu" ]]; then + printf "[+] Building with CPU backend\n" + make -j server $log +elif [[ "$backend" == "metal" ]]; then + printf "[+] Building with Metal backend\n" + make -j server $log +elif [[ "$backend" == "opencl" ]]; then + printf "[+] Building with OpenCL backend\n" + LLAMA_CLBLAST=1 make -j server $log +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +# run the server + +printf "[+] Running server\n" + +args="" +if [[ "$backend" == "cuda" ]]; then + export CUDA_VISIBLE_DEVICES=$gpu_id + args="-ngl 999" +elif [[ "$backend" == "cpu" ]]; then + args="-ngl 0" +elif [[ "$backend" == "metal" ]]; then + args="-ngl 999" +elif [[ "$backend" == "opencl" ]]; then + args="-ngl 999" +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +if [[ $verbose -eq 1 ]]; then + args="$args --verbose" +fi + +./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args + +exit 0 From 73bdcb395ef9a997d9c02950c7cd4249546162cd Mon Sep 17 00:00:00 2001 From: Andrew Godfrey Date: Wed, 1 Nov 2023 04:49:04 -0700 Subject: [PATCH 31/63] finetune : add -ngl parameter (#3762) * Add '-ngl' support to finetune.cpp * Add fprintf in ggml_cuda_op_add When I tried CUDA offloading during finetuning following the readme, I got an assert here. 
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora * Add 'finetune.sh', which currently fails when using GPU "error: operator (): Finetuning on tensors with type 'f16' is not yet supported" * tweak finetune.sh * Suppress some warnings in ggml.c * Add f16 implementation to ggml_compute_forward_add_f16_f32 * Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs * finetune.sh: Edit comments * Add "add_f16_f32_f32_cuda" * Tweak an error message * finetune.sh: Add an optional LLAMA_MODEL_DIR variable * finetune.sh: Add an optional LLAMA_TRAINING_DIR variable * train : minor * tabs to spaces --------- Co-authored-by: Georgi Gerganov Co-authored-by: cebtenzzre --- common/train.cpp | 2 ++ common/train.h | 1 + examples/finetune/finetune.cpp | 14 +++++++++- examples/finetune/finetune.sh | 34 +++++++++++++++++++++++ ggml-cuda.cu | 17 ++++++++++++ ggml-quants.c | 2 ++ ggml.c | 49 +++++++++++++++++++++++++--------- llama.cpp | 2 +- 8 files changed, 106 insertions(+), 15 deletions(-) create mode 100644 examples/finetune/finetune.sh diff --git a/common/train.cpp b/common/train.cpp index 3cce5da26..bc15b7a03 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() { params.n_batch = 8; params.n_gradient_accumulation = 1; params.n_epochs = -1; + params.n_gpu_layers = 0; params.custom_n_ctx = false; @@ -1080,6 +1081,7 @@ struct train_params_common get_default_train_params_common() { params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; + return params; } diff --git a/common/train.h b/common/train.h index 42fa704b8..d86c93cc4 100644 --- a/common/train.h +++ b/common/train.h @@ -44,6 +44,7 @@ struct train_params_common { int n_batch; int n_gradient_accumulation; int n_epochs; + int n_gpu_layers; bool custom_n_ctx; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 35824cd2d..60c7faa79 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type)) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else if (a->type == GGML_TYPE_F32) { return ggml_add(ctx, a, b); @@ -1459,6 +1459,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params->common.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1545,6 +1556,7 @@ int main(int argc, char ** argv) { srand(params.common.seed); struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; llama_mparams.vocab_only = 
false; printf("%s: model base = '%s'\n", __func__, params.fn_model_base); diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh new file mode 100644 index 000000000..079bfa113 --- /dev/null +++ b/examples/finetune/finetune.sh @@ -0,0 +1,34 @@ +#!/bin/bash +cd `dirname $0` +cd ../.. + +EXE="./finetune" + +if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi +if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi + +# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. + +while getopts "dg" opt; do + case $opt in + d) + DEBUGGER="gdb --args" + ;; + g) + EXE="./build/bin/Release/finetune" + GPUARG="--gpu-layers 25" + ;; + esac +done + +$DEBUGGER $EXE \ + --model-base $MODEL \ + $GPUARG \ + --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ + --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ + --lora-out lora-ol3b-shakespeare-ITERATION.bin \ + --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ + --save-every 10 \ + --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1ba951f68..4e6e7cd94 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -513,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d dst[i] = __hadd(x[i], __float2half(y[i])); } +static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __half2float(x[i]) + y[i]; +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -4693,6 +4702,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co add_f16_f32_f16<<>>(x, y, dst, k); } +static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f32<<>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<>>(x, y, dst, kx, ky); @@ -5996,7 +6010,10 @@ inline void ggml_cuda_op_add( add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream); } else { + fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type); GGML_ASSERT(false); } diff --git a/ggml-quants.c b/ggml-quants.c index 721594467..255c89b6a 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -716,6 +716,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { __riscv_vse8_v_i8m1(y[i].qs , vs, vl); } #else + UNUSED(nb); // scalar quantize_row_q8_0_reference(x, y, k); #endif @@ -969,6 +970,7 @@ void quantize_row_q8_1(const float * restrict x, void * 
restrict vy, int k) { y[i].s = sum*d; } #else + UNUSED(nb); // scalar quantize_row_q8_1_reference(x, y, k); #endif diff --git a/ggml.c b/ggml.c index 84407b122..80d682255 100644 --- a/ggml.c +++ b/ggml.c @@ -3153,7 +3153,7 @@ static struct ggml_tensor * ggml_add_cast_impl( // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); - GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16 bool is_node = false; @@ -6927,9 +6927,15 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + } + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread @@ -6940,18 +6946,35 @@ static void ggml_compute_forward_add_f16_f32( const int ir1 = MIN(ir0 + dr, nr); if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + if (dst->type == GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } } } } diff --git a/llama.cpp b/llama.cpp index ead1d421d..42cedc7a1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8003,7 +8003,7 @@ static int llama_apply_lora_from_file_internal( if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + "%s: error: the simultaneous use of 
LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); } offload_func = ggml_cuda_assign_buffers; offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; From 9a3b4f6c86503c9cfc049d4d0fdeafef12806f5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 13:50:45 +0200 Subject: [PATCH 32/63] ggml : fix UNUSED macro (#3762) --- ggml-quants.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 255c89b6a..740be6dc5 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -716,7 +716,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { __riscv_vse8_v_i8m1(y[i].qs , vs, vl); } #else - UNUSED(nb); + GGML_UNUSED(nb); // scalar quantize_row_q8_0_reference(x, y, k); #endif @@ -970,7 +970,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { y[i].s = sum*d; } #else - UNUSED(nb); + GGML_UNUSED(nb); // scalar quantize_row_q8_1_reference(x, y, k); #endif From e75dfdd31b6a3dfa0627ba4ac3bb4b36e9db588e Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Wed, 1 Nov 2023 21:40:43 +0800 Subject: [PATCH 33/63] sampling : null grammar field after reset (#3885) --- common/sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 673d67a6d..1317024c2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) { void llama_sampling_reset(llama_sampling_context * ctx) { if (ctx->grammar != NULL) { llama_grammar_free(ctx->grammar); + ctx->grammar = NULL; } if (!ctx->parsed_grammar.rules.empty()) { From a2758d08e44ce3624d233af4d23c6843e2e735b5 Mon Sep 17 00:00:00 2001 From: staviq Date: Wed, 1 Nov 2023 15:18:27 +0100 Subject: [PATCH 34/63] log : make generating separate log files optional (#3787) * impl --log-new, --log-append * Update common/log.h Co-authored-by: cebtenzzre * Update common/log.h Co-authored-by: cebtenzzre * Apply suggestions from code review Co-authored-by: cebtenzzre --------- Co-authored-by: cebtenzzre --- common/log.h | 122 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 40 deletions(-) diff --git a/common/log.h b/common/log.h index d2c864cea..c0e814861 100644 --- a/common/log.h +++ b/common/log.h @@ -97,38 +97,56 @@ #define LOG_TEE_TARGET stderr #endif -// NOTE: currently disabled as it produces too many log files +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + // Utility to obtain "pid" like unique process id and use it when creating log files. -//inline std::string log_get_pid() -//{ -// static std::string pid; -// if (pid.empty()) -// { -// // std::this_thread::get_id() is the most portable way of obtaining a "process id" -// // it's not the same as "pid" but is unique enough to solve multiple instances -// // trying to write to the same log. -// std::stringstream ss; -// ss << std::this_thread::get_id(); -// pid = ss.str(); -// } -// -// return pid; -//} +inline std::string log_get_pid() +{ + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. 
+ std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } + + return pid; +} // Utility function for generating log file names with unique id based on thread id. // invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" // where the number is a runtime id of the current thread. -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) // INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) +inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) { + static bool _multilog = false; + + if (multilog != LogTriStateSame) + { + _multilog = multilog == LogTriStateTrue; + } + std::stringstream buf; buf << log_file_basename; - //buf << "."; - //buf << log_get_pid(); + if (_multilog) + { + buf << "."; + buf << log_get_pid(); + } buf << "."; buf << log_file_extension; @@ -213,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #define LOG_TEE_FLF_VAL ,"" #endif -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - // INTERNAL, DO NOT USE // USE LOG() INSTEAD // @@ -315,16 +324,23 @@ enum LogTriState #endif // INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) { - static bool _initialized{false}; - static bool _disabled{(filename.empty() && target == nullptr)}; + static bool _initialized = false; + static bool _append = false; + static bool _disabled = filename.empty() && target == nullptr; static std::string log_current_filename{filename}; static FILE *log_current_target{target}; static FILE *logfile = nullptr; if (change) { + if (append != LogTriStateSame) + { + _append = append == LogTriStateTrue; + return logfile; + } + if (disable == LogTriStateTrue) { // Disable primary target @@ -377,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } } - logfile = fopen(filename.c_str(), "w"); + logfile = fopen(filename.c_str(), _append ? "a" : "w"); } if (!logfile) @@ -398,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } // INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) { - return log_handler1_impl(change, disable, filename, target); + return log_handler1_impl(change, append, disable, filename, target); } // Disables logs entirely at runtime. 
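With the tri-state parameters above, the append and multilog choices are latched the first time the log file is opened and cannot be changed afterwards, which is why the helpers in this patch are documented as "BEFORE first log use" only. A minimal usage sketch (assumptions: the header's existing `LOG()` macro and include path, plus the `log_append`/`log_multilog` helpers introduced by this patch):

```c++
#include "log.h"

int main(int argc, char ** argv) {
    // both knobs must be set before the first LOG() call, i.e. before the log file is opened
    log_append(true);       // what --log-append enables: open the default log file with "a" instead of "w"
    //log_multilog(true);   // what --log-new enables: write a unique per-run log file instead

    LOG("starting with %d arguments\n", argc);
    return 0;
}
```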
@@ -411,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri // INTERNAL, DO NOT USE inline FILE *log_disable_impl() { - return log_handler1_impl(true, LogTriStateTrue); + return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); } // Enables logs at runtime. @@ -420,19 +436,31 @@ inline FILE *log_disable_impl() // INTERNAL, DO NOT USE inline FILE *log_enable_impl() { - return log_handler1_impl(true, LogTriStateFalse); + return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); } // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) #define log_set_target(target) log_set_target_impl(target) // INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } // INTERNAL, DO NOT USE inline FILE *log_handler() { return log_handler1_impl(); } +// Enable or disable creating separate log files for each run. +// can ONLY be invoked BEFORE first log use. +#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") +// Enable or disable append mode for log file. +// can ONLY be invoked BEFORE first log use. +#define log_append(enable) log_append_impl(enable) +// INTERNAL, DO NOT USE +inline FILE *log_append_impl(bool enable) +{ + return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); +} + inline void log_test() { log_disable(); @@ -494,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param) return true; } + if (param == "--log-new") + { + log_multilog(true); + return true; + } + + if (param == "--log-append") + { + log_append(true); + return true; + } + return false; } @@ -523,7 +563,9 @@ inline void log_print_usage() printf(" --log-disable Disable trace logs\n"); printf(" --log-enable Enable trace logs\n"); printf(" --log-file Specify a log filename (without extension)\n"); - printf(" Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ + printf(" --log-new Create a separate new log file on start. 
" + "Each log file will have unique name: \"..log\"\n"); + printf(" --log-append Don't truncate the old log file.\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) From 0e40806c1cb3bdf9955ed807ffbe212be85b4c67 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:42:01 -0300 Subject: [PATCH 35/63] common : allow caller to handle help/argument exceptions (#3715) * Allow caller to handle help/argument exceptions * Prepend newline to usage output * Add new gpt_params_parse_ex function to hide arg-parse impl * Fix issue blocking success case * exit instead of returning false * Update common/common.h Co-authored-by: Georgi Gerganov * Update common/common.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 41 ++++++++++++++++++++++++++--------------- common/common.h | 2 ++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dc4865e80..89be41261 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -103,9 +103,24 @@ void process_escapes(std::string& input) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool result = true; + try { + if (!gpt_params_parse_ex(argc, argv, params)) { + gpt_print_usage(argc, argv, gpt_params()); + exit(0); + } + } + catch (const std::invalid_argument& ex) { + fprintf(stderr, ex.what()); + gpt_print_usage(argc, argv, gpt_params()); + exit(1); + } + return result; +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; - gpt_params default_params; const std::string arg_prefix = "--"; llama_sampling_params & sparams = params.sparams; @@ -554,11 +569,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, default_params); -#ifndef LOG_DISABLE_LOGS - log_print_usage(); -#endif // LOG_DISABLE_LOGS - exit(0); + return false; + } else if (arg == "--random-prompt") { params.random_prompt = true; } else if (arg == "--in-prefix-bos") { @@ -617,22 +629,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: unknown argument: " + arg); } } if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { - fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n"); - gpt_print_usage(argc, argv, default_params); - exit(1); + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } if (params.escape) { @@ -651,6 +658,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); @@ -762,6 +770,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & 
params) { printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); printf("\n"); +#ifndef LOG_DISABLE_LOGS + log_print_usage(); +#endif // LOG_DISABLE_LOGS } std::string get_system_info(const gpt_params & params) { diff --git a/common/common.h b/common/common.h index 84523a4fb..343b27217 100644 --- a/common/common.h +++ b/common/common.h @@ -110,6 +110,8 @@ struct gpt_params { std::string image = ""; // path to an image file }; +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); + bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); From 50337961a678fce4081554b24e56e86b67660163 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 20:11:02 +0200 Subject: [PATCH 36/63] llm : add llm_build_context (#3881) * llm : add llm_build_context * llm : deduce norm eps based on type + explict max_alibi_bias, clamp_kqv * llm : restore the non-graph llm_build_ functional API ggml-ci * llm : cleanup + comments --- llama.cpp | 2338 ++++++++++++++++++++++++----------------------------- 1 file changed, 1042 insertions(+), 1296 deletions(-) diff --git a/llama.cpp b/llama.cpp index 42cedc7a1..d0c4ef101 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3090,6 +3090,10 @@ static bool llama_model_load( return true; } +// +// llm_build +// + using llm_build_cb = std::function; enum llm_rope_type { @@ -3098,17 +3102,35 @@ enum llm_rope_type { LLM_ROPE_GLM, }; +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + static struct ggml_tensor * llm_build_inp_embd( struct ggml_context * ctx, + const llama_hparams & hparams, const llama_batch & batch, struct ggml_tensor * tok_embd, - int64_t n_embd, - int32_t n_tokens, const llm_build_cb & cb) { + const int64_t n_embd = hparams.n_embd; + struct ggml_tensor * inpL; if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); @@ -3117,7 +3139,7 @@ static struct ggml_tensor * llm_build_inp_embd( GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); } return inpL; @@ -3126,28 +3148,21 @@ static struct ggml_tensor * llm_build_inp_embd( // Persimmon: n_rot = n_embd_head/2 // Other: n_rot = n_embd_head static void llm_build_k_shift( - const llama_context & lctx, - struct ggml_context * ctx, - struct ggml_cgraph * graph, - int64_t n_rot, - llm_rope_type type, - const llm_build_cb & cb) { - const auto & model = lctx.model; - const auto & kv_self = lctx.kv_self; - const auto & cparams = lctx.cparams; - - const auto & hparams = model.hparams; - + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + llm_rope_type type, + int64_t n_ctx, + int64_t n_rot, + float freq_base, + float freq_scale, + const llm_build_cb & cb) { const int64_t n_layer = hparams.n_layer; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); - const 
int64_t n_ctx = lctx.cparams.n_ctx; - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - GGML_ASSERT(n_embd_head % n_rot == 0); struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); @@ -3165,11 +3180,11 @@ static void llm_build_k_shift( struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv_self.k, + ggml_view_3d(ctx, kv.k, n_rot, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), K_shift, n_rot, rope_type, 0, freq_base, freq_scale); cb(tmp, "K_shifted", il); ggml_build_forward_expand(graph, tmp); @@ -3177,22 +3192,17 @@ static void llm_build_k_shift( } static void llm_build_kv_store( - const llama_context & lctx, struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, + int64_t n_ctx, int32_t n_tokens, int32_t kv_head, const llm_build_cb & cb, int64_t il) { - const auto & model = lctx.model; - const auto & kv_self = lctx.kv_self; - const auto & cparams = lctx.cparams; - - const auto & hparams = model.hparams; - - const int64_t n_ctx = cparams.n_ctx; const int64_t n_embd_gqa = hparams.n_embd_gqa(); // compute the transposed [n_tokens, n_embd] V matrix @@ -3200,13 +3210,13 @@ static void llm_build_kv_store( //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed cb(v_cur_t, "v_cur_t", il); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); cb(k_cache_view, "k_cache_view", il); - struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv.v), + (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); cb(v_cache_view, "v_cache_view", il); // important: storing RoPE-ed version of K in the KV cache! 
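Spelled out, the element offsets computed by the two cache views above are as follows (a sketch only, reusing the names from the code; the element size is whatever type `kv.k`/`kv.v` were allocated with):

```c++
// K cache: one contiguous, already RoPE-ed slab of n_embd_gqa * n_ctx elements per layer.
// New tokens for layer il start at:
//   k_off = n_embd_gqa * (il*n_ctx + kv_head);    // the ggml_view_1d offset above
//
// V cache: stored transposed, so each of the n_embd_gqa rows spans n_ctx token slots:
//   v_off     = n_embd_gqa * il*n_ctx + kv_head;  // the ggml_view_2d offset above
//   row_pitch = n_ctx;                            // elements between consecutive rows
//
// The transposed V layout is what lets llm_build_kqv() view the cached V per layer as
// [n_kv, n_embd_head, n_head_kv] without an extra copy.
```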
@@ -3214,23 +3224,18 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); } -enum llm_norm_type { - LLM_NORM, - LLM_NORM_RMS, -}; - static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, + const llama_hparams & hparams, struct ggml_tensor * mw, struct ggml_tensor * mb, llm_norm_type type, - float eps, const llm_build_cb & cb, int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; + case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; } if (mw || mb) { @@ -3251,18 +3256,6 @@ static struct ggml_tensor * llm_build_norm( return cur; } -enum llm_ffn_op_type { - LLM_FFN_SILU, - LLM_FFN_GELU, - LLM_FFN_RELU, - LLM_FFN_RELU_SQR, -}; - -enum llm_ffn_gate_type { - LLM_FFN_SEQ, - LLM_FFN_PAR, // ffn_gate is parallel to ffn_up -}; - static struct ggml_tensor * llm_build_ffn( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -3351,26 +3344,21 @@ static struct ggml_tensor * llm_build_ffn( // if max_alibi_bias > 0 then apply ALiBi static struct ggml_tensor * llm_build_kqv( - const llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, + const llama_hparams & hparams, + const llama_kv_cache & kv, struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * q_cur, struct ggml_tensor * kq_scale, struct ggml_tensor * kq_mask, + int64_t n_ctx, int32_t n_tokens, int32_t n_kv, - float alibi_bias_max, + float max_alibi_bias, const llm_build_cb & cb, - int il) { - const auto & model = lctx.model; - const auto & kv_self = lctx.kv_self; - const auto & cparams = lctx.cparams; - - const auto & hparams = model.hparams; - - const int64_t n_ctx = cparams.n_ctx; + int il) { const int64_t n_embd = hparams.n_embd; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; @@ -3381,11 +3369,11 @@ static struct ggml_tensor * llm_build_kqv( cb(q, "q", il); struct ggml_tensor * k = - ggml_view_3d(ctx, kv_self.k, + ggml_view_3d(ctx, kv.k, n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); cb(k, "k", il); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); @@ -3394,11 +3382,11 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_scale(ctx, kq, kq_scale); cb(kq, "kq_scaled", il); - if (alibi_bias_max > 0.0f) { + if (max_alibi_bias > 0.0f) { // TODO: n_head or n_head_kv // TODO: K-shift is likely not working // TODO: change to ggml_add - kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max); + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); cb(kq, "kq_scaled_alibi", il); } @@ -3410,11 +3398,11 @@ static struct ggml_tensor * llm_build_kqv( // split cached v into n_head heads struct ggml_tensor * v = - ggml_view_3d(ctx, kv_self.v, + ggml_view_3d(ctx, kv.v, n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_element_size(kv.v)*n_ctx, + ggml_element_size(kv.v)*n_ctx*n_embd_head, + ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); cb(v, "v", il); struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); @@ 
-3438,1259 +3426,1011 @@ static struct ggml_tensor * llm_build_kqv( return cur; } -static struct ggml_cgraph * llm_build_llama( +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( llama_context & lctx, const llama_batch & batch, const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + bool worst_case) : + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute (lctx.buf_compute) { + GGML_ASSERT(!!kv_self.ctx); - const auto & kv_self = lctx.kv_self; + // all initializations should be done in init() + } - GGML_ASSERT(!!kv_self.ctx); + void init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = worst_case || kv_self.has_shift; - - //printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); + ctx0 = ggml_init(params); } - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + void free() { + if (ctx0) { + ggml_free(ctx0); + ctx0 = nullptr; + } + } + + struct ggml_cgraph * build_llama() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * 
ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_baichuan() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case MODEL_7B: + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + break; + case MODEL_13B: + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + GGML_ASSERT(false); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + // apply ALiBi for 13B model + const float max_alibi_bias = model.type == MODEL_13B ? 
8.0f : -1.0f; + + cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_falcon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, attn_norm, hparams, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, 
kv_head, cb, il); + + cur = llm_build_kqv(ctx0, attn_norm, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = cur; + + // feed forward + { + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; // norm - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "attn_norm", il); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + ggml_build_forward_expand(gf, cur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Kcur, "Kcur", il); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, cur, - model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; + return gf; } - cur = inpL; + struct ggml_cgraph * build_starcoder() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - cur = llm_build_norm(ctx0, cur, - model.output_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, -1); - cb(cur, "result_norm", -1); + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - ggml_build_forward_expand(gf, cur); + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - ggml_free(ctx0); + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - return gf; -} + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct 
ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); -static struct ggml_cgraph * llm_build_baichaun( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); - const auto & kv_self = lctx.kv_self; + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); - GGML_ASSERT(!!kv_self.ctx); + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - GGML_ASSERT(n_embd_head == hparams.n_rot); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - const bool do_rope_shift = worst_case || kv_self.has_shift; + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - auto & buf_compute = lctx.buf_compute; + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - switch (model.type) { - case MODEL_7B: - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - break; - case MODEL_13B: - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); - break; - default: - GGML_ASSERT(false); - } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - // apply ALiBi for 13B model - const float alibi_bias_max = model.type == MODEL_13B ? 
8.0f : -1.0f; - - cur = llm_build_kqv(lctx, ctx0, cur, - model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, - model.output_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_falcon( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = worst_case || kv_self.has_shift; - - //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", - // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - if (model.layers[il].attn_norm_2) { - // Falcon-40B - cur = llm_build_norm(ctx0, attn_norm, - model.layers[il].attn_norm_2, - model.layers[il].attn_norm_2_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "attn_norm_2", il); - } else { - cur = attn_norm; + cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); - cb(Kcur, "Kcur", il); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, attn_norm, - model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = cur; - - // feed forward - { - cur = llm_build_ffn(ctx0, attn_norm, // !! 
use the attn norm, not the result - model.layers[il].ffn_up, NULL, - NULL, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - cur = llm_build_norm(ctx0, cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, norm_eps, cb, -1); - cb(cur, "result_norm", -1); - - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_starcoder( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, 
Qcur, n_embd_head, n_head, n_tokens); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, cur, - model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); - cb(cur, "kqv_out", il); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = llm_build_norm(ctx0, inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, norm_eps, cb, -1); - cb(cur, "result_norm", -1); - - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_persimmon( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const auto & cparams = lctx.cparams; - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_rot = n_embd_head / 2; - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = worst_case || kv_self.has_shift; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "imp_embd", -1); - - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - if (do_rope_shift) { - llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * residual = inpL; - - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "attn_norm", il); - - // self attention - { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - // split qkv - GGML_ASSERT(n_head_kv == n_head); - - struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - cb(tmpqkv, "tmpqkv", il); - - struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - cb(tmpqkv_perm, "tmpqkv", il); - - struct ggml_tensor * tmpq = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - 0 - ); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * tmpk = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens - ); - cb(tmpk, "tmpk", il); - - // Q/K Layernorm - tmpq = llm_build_norm(ctx0, tmpq, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(tmpq, "tmpq", il); - - tmpk = llm_build_norm(ctx0, tmpk, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(tmpk, "tmpk", il); - - // RoPE the first n_rot of q/k, pass the other half, and concat. 
- struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - ggml_element_size(tmpq) * n_embd_head, - ggml_element_size(tmpq) * n_embd_head * n_head, - 0 - ); - cb(qrot, "qrot", il); - - struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - ggml_element_size(tmpk) * n_embd_head, - ggml_element_size(tmpk) * n_embd_head * n_head, - 0 - ); - cb(krot, "krot", il); - - // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - ggml_element_size(tmpq) * n_embd_head, - ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * n_rot - ); - cb(qpass, "qpass", il); - - struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - ggml_element_size(tmpk) * n_embd_head, - ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * n_rot - ); - cb(kpass, "kpass", il); - - struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); - cb(qrotated, "qrotated", il); - - struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); - cb(krotated, "krotated", il); - - // ggml currently only supports concatenation on dim=2 - // so we need to permute qrot, qpass, concat, then permute back. - qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - cb(qrotated, "qrotated", il); - - krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - cb(krotated, "krotated", il); - - qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - cb(qpass, "qpass", il); - - kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - cb(kpass, "kpass", il); - - struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - cb(Q, "Q", il); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - ); - cb(Vcur, "Vcur", il); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - // TODO: not tested, could be broken - cur = llm_build_kqv(lctx, ctx0, Q, - model.layers[il].wo, model.layers[il].bo, - Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, norm_eps, cb, -1); - cb(cur, "result_norm", -1); - - cur = ggml_mul_mat(ctx0, model.output, cur); - 
cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, Qcur, - model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, - model.output_norm, NULL, - LLM_NORM_RMS, norm_rms_eps, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - 
- ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_bloom( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; - - params.no_alloc = true; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - inpL = llm_build_norm(ctx0, inpL, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, norm_eps, cb, -1); - cb(inpL, "inp_norm", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, Qcur, - model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); - cb(cur, "kqv_out", il); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, norm_eps, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - inpL = 
ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = llm_build_norm(ctx0, inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, norm_eps, cb, -1); - cb(cur, "result_norm", -1); - - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_mpt( - llama_context & lctx, - const llama_batch & batch, - const llm_build_cb & cb, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_eps = hparams.f_norm_eps; - const float clamp_kqv = hparams.f_clamp_kqv; - const float max_alibi_bias = hparams.f_max_alibi_bias; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; - - params.no_alloc = true; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); - cb(inpL, "inp_embd", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = llm_build_norm(ctx0, inpL, - model.layers[il].attn_norm, - NULL, - LLM_NORM, norm_eps, cb, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - cur = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - if (clamp_kqv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - cb(cur, "wqkv_clamped", il); + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = 
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - cur = llm_build_kqv(lctx, ctx0, Qcur, - model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il); - cb(cur, "kqv_out", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // feed forward - { - cur = llm_build_norm(ctx0, ffn_inp, - model.layers[il].ffn_norm, + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_persimmon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_rot = n_embd_head / 2; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "imp_embd", -1); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * residual = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + // split qkv + GGML_ASSERT(n_head_kv == n_head); + + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); + cb(tmpqkv, "tmpqkv", il); + + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + cb(tmpqkv_perm, "tmpqkv", il); + + struct ggml_tensor * tmpq = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + 0 + ); + cb(tmpq, "tmpq", il); + + struct ggml_tensor * tmpk = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + ); + cb(tmpk, "tmpk", il); + + // Q/K Layernorm + tmpq = llm_build_norm(ctx0, tmpq, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + cb(tmpq, "tmpq", il); + + tmpk = llm_build_norm(ctx0, tmpk, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + cb(tmpk, "tmpk", il); + + // RoPE the first n_rot of q/k, pass the other half, and concat. 
+ struct ggml_tensor * qrot = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + 0 + ); + cb(qrot, "qrot", il); + + struct ggml_tensor * krot = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + 0 + ); + cb(krot, "krot", il); + + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct ggml_tensor * qpass = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + ggml_element_size(tmpq) * n_rot + ); + cb(qpass, "qpass", il); + + struct ggml_tensor * kpass = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + ggml_element_size(tmpk) * n_rot + ); + cb(kpass, "kpass", il); + + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale + ); + cb(qrotated, "qrotated", il); + + struct ggml_tensor * krotated = ggml_rope_custom( + ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale + ); + cb(krotated, "krotated", il); + + // ggml currently only supports concatenation on dim=2 + // so we need to permute qrot, qpass, concat, then permute back. + qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); + cb(qrotated, "qrotated", il); + + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); + cb(krotated, "krotated", il); + + qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + cb(qpass, "qpass", il); + + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + cb(kpass, "kpass", il); + + struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + cb(Q, "Q", il); + + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + ); + cb(Vcur, "Vcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + // TODO: not tested, could be broken + cur = llm_build_kqv(ctx0, Q, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = 
ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_refact() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bloom() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, 
model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mpt() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, - LLM_NORM, norm_eps, cb, il); - cb(cur, "ffn_norm", il); + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - NULL, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); + // self-attention + { + cur = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, 
"Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed forward + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); + cur = inpL; - // input for next layer - inpL = cur; + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, - model.output_norm, - NULL, - LLM_NORM, norm_eps, cb, -1); - cb(cur, "result_norm", -1); - - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} +}; // // tensor offloading helpers @@ -5122,43 +4862,49 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; + struct llm_build_context llm(lctx, batch, cb, worst_case); + + llm.init(); + switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch, cb, worst_case); + result = llm.build_llama(); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch, cb, worst_case); + result = llm.build_baichuan(); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch, cb, worst_case); + result = llm.build_falcon(); } break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, batch, cb, worst_case); + result = llm.build_starcoder(); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch, cb, worst_case); + result = llm.build_persimmon(); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch, cb, worst_case); + result = llm.build_refact(); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch, cb, worst_case); + result = llm.build_bloom(); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch, cb, worst_case); + result = llm.build_mpt(); } break; default: GGML_ASSERT(false); } + llm.free(); + if (worst_case) { int n_non_view_total = 0; From ff8f9a88da0018972dfdf6fe64b5c8992caabd9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 21:15:55 +0200 Subject: [PATCH 37/63] common : minor (#3715) --- common/common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 89be41261..7a48e9d11 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -110,8 +110,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { exit(0); } } - catch (const std::invalid_argument& ex) { - fprintf(stderr, ex.what()); + 
catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); gpt_print_usage(argc, argv, gpt_params()); exit(1); } From e16b9fa4baa8a09c6619b116159830e898050942 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 21:25:00 +0200 Subject: [PATCH 38/63] metal : multi-simd softmax (#3710) ggml-ci --- ggml-metal.m | 9 +++- ggml-metal.metal | 129 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 108 insertions(+), 30 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index bc881395a..1f0341507 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1001,11 +1001,15 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SOFT_MAX: { - const int nth = MIN(32, ne00); + int nth = 32; // SIMD width if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; } else { + do { + nth *= 2; + } while (nth <= ne00 && nth <= 1024); + nth /= 2; [encoder setComputePipelineState:ctx->pipeline_soft_max]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1013,8 +1017,9 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { diff --git a/ggml-metal.metal b/ggml-metal.metal index f4b460564..f3152778a 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -184,36 +184,73 @@ kernel void kernel_soft_max( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) { + float lmax = tpitg < ne00 ? 
psrc0[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { lmax = MAX(lmax, psrc0[i00]); } - const float max = simd_max(lmax); + + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float lsum = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { const float exp_psrc0 = exp(psrc0[i00] - max); lsum += exp_psrc0; // Remember the result of exp here. exp is expensive, so we really do not - // whish to compute it twice. + // wish to compute it twice. pdst[i00] = exp_psrc0; } - const float sum = simd_sum(lsum); + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; + + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { pdst[i00] /= sum; } } @@ -224,37 +261,73 @@ kernel void kernel_soft_max_4( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + float4 lmax4 = tpitg < ne00/4 ? 
psrc4[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { lmax4 = fmax(lmax4, psrc4[i00]); } - float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - const float max = simd_max(lmax); + const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float4 lsum4 = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { const float4 exp_psrc4 = exp(psrc4[i00] - max); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } - float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - const float sum = simd_sum(lsum); + const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; + + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { pdst4[i00] /= sum; } } @@ -274,7 +347,7 @@ kernel void kernel_diag_mask_inf( dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; } else { dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; - } + } } kernel void kernel_diag_mask_inf_8( From 523e49b11174368cd73460fa5eae7b39d856f300 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 23:00:50 +0200 Subject: [PATCH 39/63] llm : fix falcon norm after refactoring (#3837) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d0c4ef101..17cf364bb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3763,7 +3763,7 @@ struct llm_build_context { { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, attn_norm, hparams, + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); From c43c2da8afacaddfe51c09b21dbd9922cd0ea46b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 1 Nov 2023 23:08:30 +0200 Subject: [PATCH 40/63] llm : fix llm_build_kqv taking unused tensor (benign, #3837) --- llama.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 17cf364bb..1c6d482f8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3345,7 +3345,6 @@ static struct ggml_tensor * llm_build_ffn( // if max_alibi_bias > 0 then apply ALiBi static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, - struct ggml_tensor * cur, const llama_hparams & hparams, const llama_kv_cache & kv, struct ggml_tensor * wo, @@ -3411,7 +3410,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); cb(cur, "kqv_merged_cont", il); cur = ggml_mul_mat(ctx, wo, cur); @@ -3565,7 +3564,7 @@ struct llm_build_context { 
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); @@ -3677,7 +3676,7 @@ struct llm_build_context { // apply ALiBi for 13B model const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; - cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); cb(cur, "kqv_out", il); @@ -3795,7 +3794,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, attn_norm, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); @@ -3895,7 +3894,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, cur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); @@ -4100,7 +4099,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); // TODO: not tested, could be broken - cur = llm_build_kqv(ctx0, Q, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); cb(cur, "kqv_out", il); @@ -4191,7 +4190,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); cb(cur, "kqv_out", il); @@ -4288,7 +4287,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); cb(cur, "kqv_out", il); @@ -4382,7 +4381,7 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self, + cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); cb(cur, "kqv_out", il); From 898aeca90a9bb992f506234cf3b8b7f7fa28a1df Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 18:04:33 -0400 Subject: [PATCH 41/63] llama : implement YaRN RoPE scaling (#2268) Co-authored-by: cebtenzzre Co-authored-by: Jeffrey Quesnelle --- common/common.cpp | 79 +++++- common/common.h | 7 + convert-baichuan-hf-to-gguf.py | 3 +- convert.py | 97 ++++--- examples/finetune/finetune.cpp | 5 +- examples/server/server.cpp | 59 ++++- .../train-text-from-scratch.cpp | 6 +- ggml-cuda.cu | 153 ++++++++--- ggml-metal.m | 22 +- ggml-metal.metal | 61 ++++- ggml.c | 241 +++++++++++++----- ggml.h | 20 +- gguf-py/gguf/gguf.py | 29 ++- 
llama.cpp | 220 ++++++++++++---- llama.h | 18 +- 15 files changed, 763 insertions(+), 257 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 7a48e9d11..b182ffaae 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -219,12 +219,52 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -716,9 +756,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. 
(default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -826,17 +873,23 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; - cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; - cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; - cparams.rope_freq_base = params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale; + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_scaling_type = params.rope_scaling_type; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } diff --git a/common/common.h b/common/common.h index 343b27217..7be69f925 100644 --- a/common/common.h +++ b/common/common.h @@ -9,6 +9,7 @@ #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" +#include #include #include #include @@ -54,6 +55,12 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. 
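
The `llama_context_params_from_gpt_params()` hunk above shows where the new flags end up: they are copied verbatim into the extended `llama_context_params`. A minimal usage sketch of the resulting API (not part of the diff; it assumes the new fields and the `LLAMA_ROPE_SCALING_YARN` value introduced by this change, and all numeric values are illustrative only):

```c++
// Hypothetical sketch: request YaRN scaling programmatically via the
// llama_context_params fields added in this patch (example values).
llama_context * make_yarn_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx             = 16384;                   // extended context window
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // instead of linear scaling
    cparams.rope_freq_scale   = 0.25f;                   // expands context by 1/0.25 = 4x
    cparams.yarn_orig_ctx     = 4096;                    // model's original training context
    cparams.yarn_ext_factor   = 1.0f;                    // extrapolation mix (default)
    cparams.yarn_attn_factor  = 1.0f;
    cparams.yarn_beta_fast    = 32.0f;
    cparams.yarn_beta_slow    = 1.0f;

    return llama_new_context_with_model(model, cparams);
}
```
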
float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = NAN; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f;// YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters struct llama_sampling_params sparams; diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 5ee99be73..67ccbe99f 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -163,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: if "type" in hparams["rope_scaling"]: if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) # TOKENIZATION diff --git a/convert.py b/convert.py index bfbfab283..9110f1580 100755 --- a/convert.py +++ b/convert.py @@ -151,8 +151,11 @@ class Params: n_head_kv: int f_norm_eps: float + rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None ftype: GGMLFileType | None = None @@ -198,20 +201,20 @@ class Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - + rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": - f_rope_scale = config["rope_scaling"].get("factor") - else: - f_rope_scale = None + + if rope_scaling is not None and (typ := rope_scaling.get("type")): + rope_factor = rope_scaling.get("factor") + f_rope_scale = rope_factor + if typ == "linear": + rope_scaling_type = gguf.RopeScalingType.LINEAR + elif typ == "yarn": + rope_scaling_type = gguf.RopeScalingType.YARN + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] + else: + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] @@ -222,16 +225,19 @@ class Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, - n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, - f_rope_scale = f_rope_scale, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head 
:= config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json @@ -240,17 +246,8 @@ class Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] if "vocab_size" in config else -1 - n_embd = config["dim"] - n_layer = config["n_layers"] - n_ff = -1 - n_head = config["n_heads"] - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head - f_norm_eps = config["norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - # hack to determine LLaMA v1 vs v2 vs CodeLlama - if f_rope_freq_base == 1000000: + if config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: @@ -260,22 +257,16 @@ class Params: # LLaMA v1 n_ctx = 2048 - if n_vocab == -1: - n_vocab = model["tok_embeddings.weight"].shape[0] - - if n_ff == -1: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, + n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), + n_embd = config["dim"], + n_layer = config["n_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta"), ) @staticmethod @@ -831,8 +822,16 @@ class OutputFile: if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) - if params.f_rope_scale is not None: - self.gguf.add_rope_scale_linear(params.f_rope_scale) + if params.rope_scaling_type: + assert params.f_rope_scale is not None + self.gguf.add_rope_scaling_type(params.rope_scaling_type) + self.gguf.add_rope_scaling_factor(params.f_rope_scale) + + if params.n_orig_ctx is not None: + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) + + if params.rope_finetuned is not None: + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) if params.ftype is not None: self.gguf.add_file_type(params.ftype) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 60c7faa79..649a3b7c1 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int rope_mode = 0; return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale); + t, KQ_pos, n_rot, rope_mode, n_ctx, 0, + rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f + ); }; set_name(tokens_input, "tokens_input"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 47ae0d558..84b04d5a0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1755,12 +1755,18 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("options:\n"); printf(" -h, --help show this help message and exit\n"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) @@ -1881,6 +1887,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-scaling") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } + } else if (arg == "--rope-freq-base") { if (++i >= argc) @@ -1899,6 +1918,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } + else if (arg == "--yarn-ext-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-attn-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-fast") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-slow") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 1ce6cef29..2a257e632 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ 
b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs( // not capturing these, to silcence warnings const int rope_mode = 0; - return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale); + return ggml_rope_custom( + ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ); }; set_name(tokens_input, "tokens_input"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4e6e7cd94..12ee10e3d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4493,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } -// rope == RoPE == rotary positional embedding +static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} +struct rope_corr_dims { + float v[4]; +}; + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static __device__ void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// rope == RoPE == rotary positional embedding template -static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4509,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t const int i2 = row/p_delta_rows; const int p = has_pos ? 
pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, -col/ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + 1]; @@ -4522,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t } template -static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope_neox( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4534,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; + // simplified from `(row * ncols + col) * (-1 / ncols)` + const float cur_rot = -col/ncols - row; + const int p = has_pos ? pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, cur_rot); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + ncols/2]; @@ -4547,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4560,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int i = row*ncols + col; const int i2 = row/p_delta_rows; - const float col_theta_scale = powf(theta_scale, col); + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); // FIXME: this is likely wrong const int p = pos != nullptr ? 
pos[i2] : 0; @@ -5584,40 +5621,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const } template -static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } template -static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -6477,17 +6528,20 @@ inline void ggml_cuda_op_rope( const int64_t ne2 = dst->ne[2]; const int64_t nrows = ggml_nrows(src0); - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; + //const int n_past = ((int32_t *) 
dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + // RoPE alteration for extended context - - float freq_base, freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); const int32_t * pos = nullptr; if ((mode & 1) == 0) { @@ -6499,24 +6553,39 @@ inline void ggml_cuda_op_rope( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + // compute if (is_glm) { GGML_ASSERT(false); - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } } else { if (src0->type == GGML_TYPE_F32) { - rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } diff --git a/ggml-metal.m b/ggml-metal.m index 1f0341507..611d5e173 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1400,14 +1400,18 @@ void ggml_metal_graph_compute( const int nth = MIN(1024, ne00); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_orig_ctx = ((int32_t *) 
dst->op_params)[3]; - float freq_base; - float freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; @@ -1439,6 +1443,10 @@ void ggml_metal_graph_compute( [encoder setBytes:&mode length:sizeof( int) atIndex:21]; [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; + [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; + [encoder setBytes:&attn_factor length:sizeof(float) atIndex:25]; + [encoder setBytes:&beta_fast length:sizeof(float) atIndex:26]; + [encoder setBytes:&beta_slow length:sizeof(float) atIndex:27]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index f3152778a..471d7d390 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1061,6 +1061,45 @@ kernel void kernel_alibi_f32( } } +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
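
Written out, the blend that `rope_yarn` performs on top of this ramp (identically in the CUDA, Metal, and CPU versions added by this patch) is, in summary:

$$
\theta(i_0) \;=\; \bigl(1 - r(i_0)\bigr)\, s\,\theta_{\mathrm{extrap}} \;+\; r(i_0)\,\theta_{\mathrm{extrap}},
\qquad r(i_0) \;=\; \mathrm{ext\_factor}\cdot\mathrm{ramp}(i_0),
$$

where $s$ is `freq_scale` and $\mathrm{ramp}(i_0) = 1 - \mathrm{clamp}\bigl((i_0/2 - \mathrm{low}) / (\mathrm{high} - \mathrm{low}),\,0,\,1\bigr)$ with the correction dims as `low`/`high`. The resulting $\cos\theta$ and $\sin\theta$ are always scaled by `attn_factor`, and for `ext_factor` $\neq 0$ by an additional factor of $1 + 0.1\ln(1/s)$.
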
+static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + typedef void (rope_t)( device const void * src0, device const int32_t * src1, @@ -1116,6 +1155,10 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -1125,19 +1168,22 @@ kernel void kernel_rope( const bool is_neox = mode & 2; + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + device const int32_t * pos = src1; const int64_t p = pos[i2]; - const float theta_0 = freq_scale * (float)p; + const float theta_0 = (float)p; const float inv_ndims = -1.f/n_dims; if (!is_neox) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { const float theta = theta_0 * pow(freq_base, inv_ndims*i0); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -1152,9 +1198,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; diff --git a/ggml.c b/ggml.c index 80d682255..2c7fe476b 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,5 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on 
Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" #include "ggml-quants.h" @@ -4845,8 +4846,13 @@ static struct ggml_tensor * ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down, bool inplace) { @@ -4862,11 +4868,15 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -4884,7 +4894,9 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ); } struct ggml_tensor * ggml_rope_inplace( @@ -4894,7 +4906,9 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); } struct ggml_tensor * ggml_rope_custom( @@ -4904,9 +4918,17 @@ struct ggml_tensor * ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -4916,9 +4938,17 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); } struct ggml_tensor * ggml_rope_xpos_inplace( @@ -4928,7 +4958,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // ggml_rope_back @@ -10901,6 +10931,45 @@ static void 
ggml_compute_forward_clamp( // ggml_compute_forward_rope +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10910,21 +10979,26 @@ static void ggml_compute_forward_rope_f32( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // these two only relevant for xPos RoPE: float xpos_base; bool xpos_down; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); 
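
The same 13-slot layout is unpacked again in the CUDA and Metal hunks above, so for reference here is a hypothetical struct view of what `ggml_rope_impl` packs. This struct does not exist in the source (the real code packs and unpacks by index with `memcpy`); 13 four-byte slots are also why the `ggml.h` hunk further below raises `GGML_MAX_OP_PARAMS` from 32 to 64 bytes:

```c++
// Hypothetical struct view of the RoPE op_params layout after this patch.
// The real code stores these by index into dst->op_params with memcpy;
// this is only a reading aid for the pack/unpack sites.
struct rope_op_params_view {
    int32_t n_past;      // [0]  placeholder, always 0
    int32_t n_dims;      // [1]
    int32_t mode;        // [2]
    int32_t n_ctx;       // [3]
    int32_t n_orig_ctx;  // [4]  new: YaRN original training context
    float   freq_base;   // [5]
    float   freq_scale;  // [6]
    float   ext_factor;  // [7]  new: YaRN extrapolation mix
    float   attn_factor; // [8]  new: YaRN magnitude scale
    float   beta_fast;   // [9]  new: YaRN low correction dim
    float   beta_slow;   // [10] new: YaRN high correction dim
    float   xpos_base;   // [11] xPos RoPE only
    int32_t xpos_down;   // [12] xPos RoPE only (a bool is memcpy'd into this slot)
};
```
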
GGML_TENSOR_UNARY_OP_LOCALS @@ -10952,6 +11026,9 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -10965,18 +11042,18 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -10994,13 +11071,16 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11014,12 +11094,19 @@ static void ggml_compute_forward_rope_f32( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11048,15 +11135,19 @@ static void ggml_compute_forward_rope_f16( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + 
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); GGML_TENSOR_UNARY_OP_LOCALS @@ -11084,6 +11175,9 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -11097,18 +11191,18 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -11126,10 +11220,12 @@ static void ggml_compute_forward_rope_f16( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); - theta *= theta_scale; + theta_base *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11143,12 +11239,19 @@ static void ggml_compute_forward_rope_f16( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11256,17 +11359,18 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11280,10 +11384,10 @@ static void ggml_compute_forward_rope_back_f32( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -11356,14 +11460,14 @@ static void ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11377,10 +11481,10 @@ static void ggml_compute_forward_rope_back_f16( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -15505,9 +15609,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1, n_dims, mode, + 0, n_ctx, freq_base, freq_scale, + 0.0f, + 1.0f, + 0.0f, + 0.0f, xpos_base, xpos_down, false), diff --git a/ggml.h b/ggml.h index 9d16c5a72..70eb25a6b 100644 --- a/ggml.h +++ b/ggml.h @@ -219,7 +219,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 -#define GGML_MAX_OP_PARAMS 32 +#define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -1326,8 +1326,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1337,8 +1342,17 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 6b7d65429..727b4e554 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -7,7 +7,7 @@ import shutil import struct import sys import tempfile -from enum import IntEnum, auto +from enum import Enum, IntEnum, auto from io import BufferedWriter from pathlib import Path from typing import IO, 
Any, BinaryIO, Callable, Sequence @@ -53,9 +53,12 @@ KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" # RoPE -KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" -KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" -KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" +KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type" +KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor" +KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" +KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" # tokenization KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" @@ -577,6 +580,11 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 +class RopeScalingType(Enum): + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + # # implementation # @@ -948,8 +956,17 @@ class GGUFWriter: def add_rope_freq_base(self, value: float): self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) - def add_rope_scale_linear(self, value: float): - self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value) + def add_rope_scaling_type(self, value: RopeScalingType): + self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value) + + def add_rope_scaling_factor(self, value: float): + self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_orig_ctx_len(self, value: int): + self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) + + def add_rope_scaling_finetuned(self, value: bool): + self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value) def add_tokenizer_model(self, model: str): self.add_string(KEY_TOKENIZER_MODEL, model) diff --git a/llama.cpp b/llama.cpp index 1c6d482f8..685882c20 100644 --- a/llama.cpp +++ b/llama.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -235,6 +236,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -276,9 +281,13 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -552,6 +561,22 @@ do { \ } \ } while (0) +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static int8_t llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : 
LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -1035,8 +1060,11 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base_train; - float rope_freq_scale_train; + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; float f_clamp_kqv; float f_max_alibi_bias; @@ -1051,6 +1079,8 @@ struct llama_hparams { if (this->n_layer != other.n_layer) return true; if (this->n_rot != other.n_rot) return true; if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; const float EPSILON = 1e-9; @@ -1081,8 +1111,16 @@ struct llama_cparams { uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - float rope_freq_base; - float rope_freq_scale; + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool mul_mat_q; }; @@ -2014,14 +2052,30 @@ static void llm_load_hparams( hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; + GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + std::string rope_scaling("linear"); + GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - hparams.rope_freq_scale_train = 1.0f/ropescale; + float ropescale = 0.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); + if (ropescale == 0.0f) { // try the old key name + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -2371,6 +2425,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; + const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); @@ -2389,8 +2445,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -3047,21 +3106,11 @@ static void llm_load_tensors( model.t_load_us = ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, use_mmap); + llama_model_loader ml(fname, params.use_mmap); - model.hparams.vocab_only = vocab_only; + model.hparams.vocab_only = params.vocab_only; llm_load_arch (ml, model); llm_load_hparams(ml, model); @@ -3073,15 +3122,15 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } llm_load_tensors( - ml, model, n_gpu_layers, - main_gpu, tensor_split, - use_mlock, progress_callback, progress_callback_user_data); + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -3150,6 +3199,7 @@ static struct ggml_tensor * llm_build_inp_embd( static void llm_build_k_shift( struct ggml_context * ctx, const llama_hparams & hparams, + const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, llm_rope_type type, @@ -3162,6 +3212,11 @@ static void llm_build_k_shift( const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_gqa = hparams.n_embd_gqa(); const int64_t n_embd_head = hparams.n_embd_head(); + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor = cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const 
float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; GGML_ASSERT(n_embd_head % n_rot == 0); @@ -3185,7 +3240,8 @@ static void llm_build_k_shift( ggml_element_size(kv.k)*n_embd_head, ggml_element_size(kv.k)*n_embd_gqa, ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), - K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); ggml_build_forward_expand(graph, tmp); } @@ -3442,12 +3498,17 @@ struct llm_build_context { const float freq_base; const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; const float norm_eps; const float norm_rms_eps; const int32_t n_tokens; const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; const bool do_rope_shift; @@ -3477,11 +3538,16 @@ struct llm_build_context { n_embd_gqa (hparams.n_embd_gqa()), freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), cb (cb), buf_compute (lctx.buf_compute) { @@ -3532,7 +3598,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3556,10 +3622,18 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); @@ -3634,7 +3708,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3658,8 +3732,16 @@ struct llm_build_context { switch (model.type) { case MODEL_7B: - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, 
n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); break; case MODEL_13B: Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); @@ -3746,7 +3828,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -3786,10 +3868,16 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(Kcur, "Kcur", il); llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); @@ -3960,7 +4048,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4053,13 +4141,15 @@ struct llm_build_context { cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); + ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale - ); + ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); cb(krotated, "krotated", il); // ggml currently only supports concatenation on dim=2 @@ -7883,8 +7973,13 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, + /*.yarn_ext_factor =*/ NAN, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, @@ -7971,10 +8066,7 @@ struct llama_model * llama_load_model_from_file( }; } - 
if (!llama_model_load(path_model, *model, params.n_gpu_layers, - params.main_gpu, params.tensor_split, - params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; @@ -8000,13 +8092,35 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; + } if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git a/llama.h b/llama.h index 75fe391ef..3f1becd76 100644 --- a/llama.h +++ b/llama.h @@ -106,6 +106,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -172,10 +180,16 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) From d02e98cde035d91ed8032ab943d1d504fe9da394 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 1 Nov 2023 23:10:09 +0100 Subject: [PATCH 42/63] ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel (#3891) * ggml-cuda : compute ptrs for cublasGemmBatchedEx in a kernel * fix warnings --- ggml-cuda.cu | 78 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 12ee10e3d..61cd1747c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6696,8 +6696,10 @@ inline void ggml_cuda_op_clamp( GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float min = ((float *) dst->op_params)[0]; - const float max = ((float *) dst->op_params)[1]; + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -7221,6 +7223,30 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } +__global__ void k_compute_batched_ptrs( + const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, + void ** ptrs, + int ne12, int ne13, + int ne23, + int nb02, int nb03, + int nb12, int nb13, + int nb2, int nb3, + int r2, int r3) { + int i13 = blockIdx.x * blockDim.x + threadIdx.x; + int i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + 
i12*nb12/2 + i13*nb13/2; + ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2; +} + static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); @@ -7322,49 +7348,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx - // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000 const int ne23 = ne12*ne13; - // TODO: avoid this alloc - void ** ptrs = (void **) malloc(3*ne23*sizeof(void *)); - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - int i03 = i13 / r3; - int i02 = i12 / r2; - - ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3]; - ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2; - ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2; - } - } - - // allocate device memory for pointers void ** ptrs_as = nullptr; - CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *))); + size_t ptrs_s = 0; + ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s); - // TODO: this does not work for some reason -- not sure why? - //size_t ptrs_s = 0; - //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s); - - // copy pointers to device - CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice)); - - free(ptrs); + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( + src0_as_f16, src1_as_f16, dst_f16, + ptrs_as, + ne12, ne13, + ne23, + nb02, nb03, + nb12, nb13, + dst->nb[2], dst->nb[3], + r2, r3); + CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01, + &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01, ne23, CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // free device memory for pointers - CUDA_CHECK(cudaFree(ptrs_as)); - //ggml_cuda_pool_free(ptrs_as, ptrs_s); + ggml_cuda_pool_free(ptrs_as, ptrs_s); } #endif From 0eb332a10f3f14a3746c391bf80ff5e7bdf29d5d Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 19:29:14 -0400 Subject: [PATCH 43/63] llama : fix llama_context_default_params after #2268 (#3893) --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index 685882c20..32d7d23de 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7980,6 +7980,7 @@ struct llama_context_params llama_context_default_params() { /*.yarn_attn_factor =*/ 1.0f, /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_orig_ctx =*/ 0, /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, From 2fffa0d61fa10e4b466e78cabcc6a4e16717b580 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Thu, 2 Nov 2023 01:49:44 -0400 Subject: [PATCH 44/63] cuda : fix RoPE after #2268 (#3897) --- ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/ggml-cuda.cu b/ggml-cuda.cu index 61cd1747c..57a528ede 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4539,7 +4539,7 @@ static __global__ void rope( const int i2 = row/p_delta_rows; const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*powf(freq_base, -col/ncols); + const float theta_base = p*powf(freq_base, -float(col)/ncols); float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -4566,8 +4566,8 @@ static __global__ void rope_neox( const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; - // simplified from `(row * ncols + col) * (-1 / ncols)` - const float cur_rot = -col/ncols - row; + // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero + const float cur_rot = -float(col)/ncols; const int p = has_pos ? pos[i2] : 0; const float theta_base = p*powf(freq_base, cur_rot); From 183b3fac6c28e65d23ac0230c1dd6fb84bf0154d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 08:33:37 +0200 Subject: [PATCH 45/63] metal : fix build errors and kernel sig after #2268 (#3898) --- ggml-metal.m | 57 ++++++++++++++++++++++++------------------------ ggml-metal.metal | 16 +++++++++----- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 611d5e173..b33a3cb8f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1419,34 +1419,35 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false); }; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; - [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; - [encoder setBytes:&attn_factor length:sizeof(float) atIndex:25]; - [encoder setBytes:&beta_fast length:sizeof(float) atIndex:26]; - [encoder setBytes:&beta_slow length:sizeof(float) atIndex:27]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 
length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 471d7d390..7c35f23a7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1070,20 +1070,20 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
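For readers following the YaRN changes here: the rope_yarn function patched just below blends a position-interpolated rotation angle with the original extrapolated angle on a per-dimension ramp, and applies a magnitude correction when interpolating. A rough plain-C++ sketch of that logic, with an assumed body for the rope_yarn_ramp helper (only its signature is visible in the hunk header above; the rest mirrors the patched Metal code):

```c++
// Illustrative sketch of the YaRN angle mixing; not the exact shader code.
#include <cmath>

// Assumed helper: ramps from 1 to 0 as i0/2 moves from `low` to `high`.
float rope_yarn_ramp_ref(float low, float high, int i0) {
    const float y = (i0 / 2 - low) / std::fmax(0.001f, high - low);
    return 1.0f - std::fmin(1.0f, std::fmax(0.0f, y));
}

// Blend the interpolated angle (freq_scale * theta_extrap) with the extrapolated
// angle according to the ramp, then correct the magnitude for interpolation.
void rope_yarn_ref(
        float theta_extrap, float freq_scale, const float corr_dims[2], int i0,
        float ext_factor, float mscale, float * cos_theta, float * sin_theta) {
    float theta_interp = freq_scale * theta_extrap;
    float theta        = theta_interp;
    if (ext_factor != 0.0f) {
        float ramp_mix = rope_yarn_ramp_ref(corr_dims[0], corr_dims[1], i0) * ext_factor;
        theta  = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix;
        // magnitude scaling used for interpolated dimensions
        mscale *= 1.0f + 0.1f * std::log(1.0f / freq_scale);
    }
    *cos_theta = std::cos(theta) * mscale;
    *sin_theta = std::sin(theta) * mscale;
}
```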
static void rope_yarn( float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta + thread float * cos_theta, thread float * sin_theta ) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta = theta_interp; if (ext_factor != 0.0f) { - ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); } - *cos_theta = cosf(theta) * mscale; - *sin_theta = sinf(theta) * mscale; + *cos_theta = cos(theta) * mscale; + *sin_theta = sin(theta) * mscale; } // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get @@ -1123,8 +1123,13 @@ typedef void (rope_t)( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]); @@ -1153,6 +1158,7 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, constant float & ext_factor, From 4d719a6d4e74b9a98e75f826f865f3153717d54b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 08:35:10 +0200 Subject: [PATCH 46/63] cuda : check if this fixes Pascal card regression (#3882) --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 57a528ede..e46295126 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7420,7 +7420,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { From b12fa0d1c13596869c512f49a526b979c94787cc Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Thu, 2 Nov 2023 02:50:16 -0400 Subject: [PATCH 47/63] build : link against build info instead of compiling against it (#3879) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cmake : fix build when .git does not exist * cmake : simplify BUILD_INFO target * cmake : add missing dependencies on BUILD_INFO * build : link against build info instead of compiling against it * zig : make build info a .cpp source instead of a header Co-authored-by: Matheus C. França * cmake : revert change to CMP0115 --------- Co-authored-by: Matheus C. 
França --- .gitignore | 2 +- CMakeLists.txt | 33 --------- Makefile | 71 ++++++++++---------- build.zig | 38 +++++------ common/CMakeLists.txt | 42 +++++++++++- common/build-info.cpp.in | 4 ++ common/common.cpp | 5 +- common/common.h | 12 +++- examples/benchmark/CMakeLists.txt | 5 +- examples/benchmark/benchmark-matmult.cpp | 1 - examples/embedding/CMakeLists.txt | 3 - examples/embedding/embedding.cpp | 1 - examples/infill/CMakeLists.txt | 3 - examples/infill/infill.cpp | 5 +- examples/llama-bench/CMakeLists.txt | 3 - examples/llama-bench/llama-bench.cpp | 5 +- examples/llava/CMakeLists.txt | 6 -- examples/main/CMakeLists.txt | 3 - examples/main/main.cpp | 5 +- examples/parallel/CMakeLists.txt | 3 - examples/parallel/parallel.cpp | 2 - examples/perplexity/CMakeLists.txt | 3 - examples/perplexity/perplexity.cpp | 1 - examples/quantize-stats/CMakeLists.txt | 2 +- examples/quantize-stats/quantize-stats.cpp | 1 - examples/quantize/CMakeLists.txt | 5 +- examples/quantize/quantize.cpp | 1 - examples/save-load-state/CMakeLists.txt | 3 - examples/save-load-state/save-load-state.cpp | 1 - examples/server/CMakeLists.txt | 3 - examples/server/server.cpp | 5 +- examples/speculative/CMakeLists.txt | 3 - examples/speculative/speculative.cpp | 2 - scripts/build-info.cmake | 30 +++++---- scripts/build-info.h.in | 9 --- scripts/build-info.sh | 13 ++-- 36 files changed, 143 insertions(+), 191 deletions(-) create mode 100644 common/build-info.cpp.in delete mode 100644 scripts/build-info.h.in diff --git a/.gitignore b/.gitignore index 5d7c5479e..50cbd0b47 100644 --- a/.gitignore +++ b/.gitignore @@ -65,7 +65,7 @@ models-mnt /parallel /train-text-from-scratch /vdot -build-info.h +/common/build-info.cpp arm_neon.h compile_commands.json CMakeSettings.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 3659279e2..611ed3f4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,39 +100,6 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) -# -# Build info header -# - -# Generate initial build-info.h -include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) - -if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git") - - # Is git submodule - if(NOT IS_DIRECTORY "${GIT_DIR}") - file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) - string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}") - endif() - - # Add a custom target for build-info.h - add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") - - # Add a custom command to rebuild build-info.h when .git/index changes - add_custom_command( - OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" - COMMENT "Generating build details from Git" - COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS "${GIT_DIR}/index" - VERBATIM - ) -else() - message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") -endif() - # # Compile flags # diff --git a/Makefile b/Makefile index c53c1e726..300c1e6c7 100644 --- 
a/Makefile +++ b/Makefile @@ -542,9 +542,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l $(CXX) $(CXXFLAGS) -c $< -o $@ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h -COMMON_DEPS = common.o sampling.o grammar-parser.o +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o -common.o: common/common.cpp build-info.h $(COMMON_H_DEPS) +common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ sampling.o: common/sampling.cpp $(COMMON_H_DEPS) @@ -563,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) # # Examples # -main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp 
examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) @@ -614,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) @@ -623,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL @@ -648,7 +648,7 @@ swift: examples/batched.swift (cd examples/batched.swift; make build) endif -build-info.h: $(wildcard .git/index) scripts/build-info.sh +common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh $(CC) > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ @@ -656,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh rm $@.tmp; \ fi +build-info.o: common/build-info.cpp + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ + # # Tests # tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) run-benchmark-matmult: benchmark-matmult @@ -676,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: 
tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h diff --git a/build.zig b/build.zig index 9b58b74ca..699738f3d 100644 --- a/build.zig +++ b/build.zig @@ -10,7 +10,6 @@ const Maker = struct { builder: *std.build.Builder, target: CrossTarget, optimize: Mode, - config_header: *ConfigHeader, enable_lto: bool, include_dirs: ArrayList([]const u8), @@ -41,26 +40,24 @@ const Maker = struct { const commit_hash = try std.ChildProcess.exec( .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, ); - const config_header = builder.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline - .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}), - .BUILD_TARGET = try target.allocDescription(builder.allocator), - }, - ); + try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( + \\int LLAMA_BUILD_NUMBER = {}; + \\char const *LLAMA_COMMIT = "{s}"; + \\char const *LLAMA_COMPILER = "Zig {s}"; + \\char const *LLAMA_BUILD_TARGET = "{s}"; + \\ + , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); var m = Maker{ .builder = builder, .target = target, .optimize = builder.standardOptimizeOption(.{}), - .config_header = config_header, .enable_lto = false, .include_dirs = ArrayList([]const u8).init(builder.allocator), .cflags = ArrayList([]const u8).init(builder.allocator), .cxxflags = ArrayList([]const u8).init(builder.allocator), .objs = ArrayList(*Compile).init(builder.allocator), }; + try m.addCFlag("-std=c11"); try m.addCxxFlag("-std=c++11"); try m.addProjectInclude(&.{}); @@ -72,7 +69,7 @@ const Maker = struct { const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); if (o.target.getAbi() != .msvc) o.defineCMacro("_GNU_SOURCE", null); - o.addConfigHeader(m.config_header); + if (std.mem.endsWith(u8, src, ".c")) { o.addCSourceFiles(&.{src}, m.cflags.items); o.linkLibC(); @@ -85,7 +82,6 @@ const Maker = struct { o.linkLibCpp(); } } - o.addConfigHeader(m.config_header); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); o.want_lto = m.enable_lto; return o; @@ -105,7 +101,6 @@ const Maker = struct { // linkLibCpp already add (libc++ + libunwind + libc) e.linkLibCpp(); } - e.addConfigHeader(m.config_header); m.builder.installArtifact(e); e.want_lto = m.enable_lto; return e; @@ -121,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void { const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); const llama = make.obj("llama", "llama.cpp"); + const buildinfo = make.obj("common", "common/build-info.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); const sampling = make.obj("sampling", "common/sampling.cpp"); @@ -128,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void { const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser }); - _ = make.exe("quantize", 
"examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index fbb0ff095..0150114e3 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,8 +1,46 @@ # common + +# Build info header +# + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + + # Is git submodule + if(NOT IS_DIRECTORY "${GIT_DIR}") + file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}") + endif() + + set(GIT_INDEX "${GIT_DIR}/index") +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") + set(GIT_INDEX "") +endif() + +# Add a custom command to rebuild build-info.cpp when .git/index changes +add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} + -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." 
+ DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} + VERBATIM +) +set(TARGET build_info) +add_library(${TARGET} OBJECT build-info.cpp) +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + + set(TARGET common) -add_library(${TARGET} OBJECT +add_library(${TARGET} STATIC common.h common.cpp sampling.h @@ -21,4 +59,4 @@ endif() target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE llama) +target_link_libraries(${TARGET} PRIVATE llama build_info) diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in new file mode 100644 index 000000000..0b945aa68 --- /dev/null +++ b/common/build-info.cpp.in @@ -0,0 +1,4 @@ +int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; +char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; diff --git a/common/common.cpp b/common/common.cpp index b182ffaae..e938dee16 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,5 +1,4 @@ #include "common.h" -#include "build-info.h" #include "llama.h" #include @@ -1199,8 +1198,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const llama_sampling_params & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); - fprintf(stream, "build_number: %d\n", BUILD_NUMBER); + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 7be69f925..72a49b890 100644 --- a/common/common.h +++ b/common/common.h @@ -26,11 +26,17 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info +extern int LLAMA_BUILD_NUMBER; +extern char const *LLAMA_COMMIT; +extern char const *LLAMA_COMPILER; +extern char const *LLAMA_BUILD_TARGET; + // // CLI argument parsing // diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 14916d831..2bb47bab5 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,9 +1,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index f1c382aa9..76e3f57cc 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "ggml.h" diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 0c752c7bb..8ffc33868 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 14075609e..3295cd240 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 57d01cb0b..e4e8028da 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 9c52b7bba..62f5ce3c1 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -2,7 +2,6 @@ #include "console.h" #include "llama.h" -#include "build-info.h" #include "grammar-parser.h" #include @@ -184,8 +183,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + LOG_TEE("%s: build = %d (%s)\n", 
__func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 7e395afd0..5bdbea4e2 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 780398184..9bd82d565 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -19,7 +19,6 @@ #include "ggml.h" #include "llama.h" #include "common.h" -#include "build-info.h" #include "ggml-cuda.h" // utils @@ -641,8 +640,8 @@ struct test { } }; -const std::string test::build_commit = BUILD_COMMIT; -const int test::build_number = BUILD_NUMBER; +const std::string test::build_commit = LLAMA_COMMIT; +const int test::build_number = LLAMA_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::metal = !!ggml_cpu_has_metal(); diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 2d7979ecd..03d32c26e 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -5,9 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if (NOT MSVC) target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h - endif() -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) endif() set(TARGET llava) @@ -15,6 +12,3 @@ add_executable(${TARGET} llava.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index cc1888948..d532980b7 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 8a43b6ab8..8d985c82a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -2,7 +2,6 @@ #include "console.h" #include "llama.h" -#include "build-info.h" #include #include @@ -153,8 +152,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); diff --git 
a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index 0bbf89eae..319535a6e 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 9a0b9c183..a78df305f 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -1,8 +1,6 @@ // A basic application simulating a server with multiple clients. // The clients submite requests to the server and they are processed in parallel. -#include "build-info.h" - #include "common.h" #include "llama.h" diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index af00b4e16..3c76d3221 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index bd2c73d87..de60c5227 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index db182e263..e31cf5e38 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index dd76b1cee..271282477 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -#include "build-info.h" #include "common.h" #include "ggml.h" #include "llama.h" diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 4a8eed544..6f374a2bd 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,9 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index be0b2fe1e..d27ea5e91 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include 
"llama.h" diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index eadd13cdf..cc6ed8554 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 38d05f4d3..48d801110 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index a23ddcc55..1f0d26f77 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -11,6 +11,3 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 84b04d5a0..fd755327a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,5 @@ #include "common.h" #include "llama.h" -#include "build-info.h" #include "grammar-parser.h" #include "../llava/clip.h" @@ -2264,8 +2263,8 @@ int main(int argc, char **argv) llama_backend_init(params.numa); - LOG_INFO("build info", {{"build", BUILD_NUMBER}, - {"commit", BUILD_COMMIT}}); + LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER}, + {"commit", LLAMA_COMMIT}}); LOG_INFO("system info", { {"n_threads", params.n_threads}, diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt index 6c5c9456e..810f3c46a 100644 --- a/examples/speculative/CMakeLists.txt +++ b/examples/speculative/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 323c74652..798684f66 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,5 +1,3 @@ -#include "build-info.h" - #include "common.h" #include "llama.h" diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake index c86ab4379..73853dfa4 100644 --- a/scripts/build-info.cmake +++ b/scripts/build-info.cmake @@ -1,5 +1,5 @@ -set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in") -set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") +set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") +set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") set(BUILD_NUMBER 0) set(BUILD_COMMIT "unknown") set(BUILD_COMPILER "unknown") @@ -24,15 +24,21 @@ if(Git_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE HEAD OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES ) + if (RES EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + endif() execute_process( COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE COUNT 
OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES ) - set(BUILD_COMMIT ${HEAD}) - set(BUILD_NUMBER ${COUNT}) + if (RES EQUAL 0) + set(BUILD_NUMBER ${COUNT}) + endif() endif() if(MSVC) @@ -53,22 +59,22 @@ else() set(BUILD_TARGET ${OUT}) endif() -# Only write the header if it's changed to prevent unnecessary recompilation -if(EXISTS ${HEADER_FILE}) - file(READ ${HEADER_FILE} CONTENTS) - string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS}) +# Only write the build info if it changed +if(EXISTS ${OUTPUT_FILE}) + file(READ ${OUTPUT_FILE} CONTENTS) + string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) set(OLD_COMMIT ${CMAKE_MATCH_1}) - string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS}) + string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) set(OLD_COMPILER ${CMAKE_MATCH_1}) - string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS}) + string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) set(OLD_TARGET ${CMAKE_MATCH_1}) if ( NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR NOT OLD_TARGET STREQUAL BUILD_TARGET ) - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) endif() else() - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) endif() diff --git a/scripts/build-info.h.in b/scripts/build-info.h.in deleted file mode 100644 index e996faef0..000000000 --- a/scripts/build-info.h.in +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER @BUILD_NUMBER@ -#define BUILD_COMMIT "@BUILD_COMMIT@" -#define BUILD_COMPILER "@BUILD_COMPILER@" -#define BUILD_TARGET "@BUILD_TARGET@" - -#endif // BUILD_INFO_H diff --git a/scripts/build-info.sh b/scripts/build-info.sh index 3c8b1fb85..32682afbd 100755 --- a/scripts/build-info.sh +++ b/scripts/build-info.sh @@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then build_target=$out fi -echo "#ifndef BUILD_INFO_H" -echo "#define BUILD_INFO_H" -echo -echo "#define BUILD_NUMBER $build_number" -echo "#define BUILD_COMMIT \"$build_commit\"" -echo "#define BUILD_COMPILER \"$build_compiler\"" -echo "#define BUILD_TARGET \"$build_target\"" -echo -echo "#endif // BUILD_INFO_H" +echo "int LLAMA_BUILD_NUMBER = ${build_number};" +echo "char const *LLAMA_COMMIT = \"${build_commit}\";" +echo "char const *LLAMA_COMPILER = \"${build_compiler}\";" +echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";" From 1efae9b7dca2a5cc5aa21c1997b538022964ea19 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 09:54:18 +0200 Subject: [PATCH 48/63] llm : prevent from 1-D tensors being GPU split (#3697) --- llama.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 32d7d23de..bb60044b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1837,6 +1837,12 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } + if (backend == GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } + { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2817,8 +2823,8 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 
backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2877,13 +2883,13 @@ static void llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); @@ -2949,19 +2955,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - 
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += From 2756c4fbffab097736d5116007872d86456a544a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 11:20:21 +0200 Subject: [PATCH 49/63] gguf : remove special-case code for GGUFv1 (#3901) ggml-ci --- ggml.c | 58 +++-------------------------------- models/ggml-vocab-llama.gguf | Bin 595423 -> 723676 bytes 2 files changed, 5 insertions(+), 53 deletions(-) diff --git a/ggml.c b/ggml.c index 2c7fe476b..d5a49d8e4 100644 --- a/ggml.c +++ b/ggml.c @@ -18811,8 +18811,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) return n == size; } -// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 -static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { +static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; @@ -18824,19 +18823,6 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset return ok; } -static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { - p->n = 0; - p->data = NULL; - - bool ok = true; - - uint32_t n = 0; - ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; - ok = ok && gguf_fread_el(file, p->data, p->n, offset); - - return ok; -} - struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); @@ -18895,21 +18881,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->data = NULL; ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); - - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n_tensors = 0; - uint32_t n_kv = 0; - - ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); - ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); - - ctx->header.n_tensors = n_tensors; - ctx->header.n_kv = n_kv; - } else { - ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); - } + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); @@ -18919,12 +18892,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; - if (ctx->header.version == 1) { - gguf_fread_str = gguf_fread_str_v1; - } - // read the kv pairs { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); @@ -18955,15 +18922,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_ARRAY: { ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - - if 
(ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n = 0; - ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); - kv->value.arr.n = n; - } else { - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); - } + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -19022,14 +18981,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_str(file, &info->name, &offset); ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t t = 0; - ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); - info->ne[j] = t; - } else { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); - } + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf index 63bfaf672f382c0f5bbcffe54736e2698ef3ac55..549eed8c53f438a61f1b00c9bd3b7d02325f2479 100644 Binary files a/models/ggml-vocab-llama.gguf and b/models/ggml-vocab-llama.gguf differ
z8K3>Op945pE_0dHDrpC_7Copb##Naf46F(pR!bE(tHVnw=6}lPRv;0&2tCCE0)ASdE}Hglv3fGL-5h>kb}P7`>}F_%kkIc0kAC>IC@;(@R$b_@h%USM zCpcr&y{1lNgujuDnvDzW-sLK)>+lOu6sHbXtRA9!1`;le7U2@c9RF1AX~7kPVX%CeLTeEl-hpd1%S^G6FmJ(U@%=>Jc3V4}9W=e7V;v za$KjD4&RgJnF{-%1w+IV!lLxzY;F3`>5YpTzTOF61(K$v!rEK|b*LrHx6ATOZ$ecrKKgnxV263ou68BAgrl z-e_yFtWMg=Cpq2YIH&GUKZ>KmQbi^aE##7yvrQAA7d*ICbs7y{!Ixlw_p-3HndO5$ zK(!BwYE)!G3=l|Sbnl(2T`JYX)ip4KdSQ`R!NKo@V?5on*kP54G(xHtE@UfGBX6%3 zR|kzFS3YcC@8XrHOndTkNv1h%G&bziyR|9bgJIoKGDbe7n? zvH(h(3pqElrcR{O#!NUe!bQuQ5mIn+4QN@Gg4Cx}LfywOh$!@X`=Q9wA7W2PGgAiP zia46xHS87`wrkZvqgN%G zUIB3+ir?X>&#@CZo$GMvV)M)15Bb#Eh`Xb-5E(lmr_RT3l4S@C ziv(|lxB^jIodTWn(f-*FvzN2{dJs%3(e zH*^OL_#!hn4Cj^RyII@ZeIV0ZW~&(}qa3mYtwTRwfc2SEaYtQ|)(rr7gy{Kn z=<8q{XNzC!%8Cg0s)AmW3zlvXp^mgZ3`_!VZh^UdBvo9{);_vW{I-O#7-AA0Bi{jx z0n;kz#J*v+7G|~=WDSSi6TRMMQTGbecLdksRR8nkYu;4S>wVvEe`|BWAJA!Wa}gpK zQZmo3eaHFklP~$_?P|YYi0n8O%^2cM&S9&G`H>psdyRtMLPlVViikrb9=5Ufaq;Qp zPyD-T5GV8UZ%kN}CA`(>4T;+h)1lH}hEpY-rgA$zqq!yT?ecFgJ{CG#NUL0lZ6pjh zFFpw$QEMC_{yXi+X@^{!fYiodu_6%ID66|luwo9t{L?$3G#SGB?E(ib*7dO{&H_gb_8Tk>^d(Jn>6vRWZZveHxT;to6u{82 zDD4r_;z%>$u&8Q?*`DE9WcLfLSXCq66a4HnrK5b*7@U`rZJ9`1;mxQQXaphjO}CV6 z@gp=D+2RH<1Ixt9ARW$O&(>LOleX%Vp5K%g4O<~Rgp72ZoQahC=zGy(w=O>Pq1slE zp`h}IXTPl?16k23Z?)^fa=mj#A%&?0*t?O=bD^C}F~mdIITuizTO#w<(`BzP8wYgs0`IG+4MxB)}~VabVt#XXce`a0+y z#2M^pSm`(={lu%u1pAS{di}{~q(5{|RYH~!V?xpdcSUx~E-`J4)yGor^#T@Rw+iE_ z`F_n^;;*$OncMO|h)MiEGcvV$icJ(ZCzK41P&bWyvk-83x`Po`NM+LBIFjSW7}R|C znS`WpO9>Uj2Z$f~4Q~{)2&p-0u+J_)z1`Q$LhN0q*wa<+HgD1Vm$KUy3Q&2}jUW2M z6oD8_cHBoCchV_X+n2H>EM~J@X{%Mw)&60&s?EJ|Oo!<11R9JUp^I#G*%= zx=KuI8zkgl+Frb1Lz>B*q$oDGmQ?9EfQB3Rgo<%4(w%=oy8O@Uh4 z33C*u6(7UptG?zpul{oy^XZfKy|3TDd(%$~e>a=uAm56_cF85fDaM%im`#nIbL|e? zv?B6f{6pd)VaKNZ>QGH$sjh>vaiUXYFa{2ijgVlwM$o1$Z$x7p7T{|a|2>Hm3X|Ax z^b(;yoRA&(@PPV3=0VMHsxexM6IEd;#r}vJONKYccFAts$@@7|8;w)Q)+cLXFG)_a z0Gr{upr6x9WSfg$rqv_(P;H&DAqyBW`cuqjKpkOfWAI1xh#0*-Q< z+nBt492m6o8G*(=CNgd$`Nfcor1g4eGKY@|NL6`8fC9La-V9mgRQB@Dc8MwSi9s5~ z6t!mJ$F4>y1LI&spq92%@VIh!vQLEjCjiHcj0rNRcTDw}kS1-b7+dinl3)#noo^(D zwSs>Bz$Z5|ciUGM$x`@92?2?x5C4VC4R4CVM zJd0{RWFG-+qf&JjJ=O3I3d7|QF`c?;7c8jQ=SpA8m1@3g1Xl#-<>}?$`!GKL$55jA z`!(gP23TB_Za0-`kYe||BbE-=n=(*BX-!}gxzrp1BQp-l-;4$G$HIKZ0q3$iK>^_KVlr9TnHg|)CKPlbTs}O>?gKjvP>T}iU zJ7;C6jb$nZZ<$K0LK7Bq$7Effo^iA?-sc#xDOdrs==KldP;YJAQN;(%r2r+h3p)p- z;D=LlDF_(s=A0}`)AwJLgJwF~Jo^0SP8CL< z(d-f6zb{*b1^LWU24zRj>|yD0BJwS$MDTUFtjUic=+9&vA35StkgIo?-((Z_78-<{?N^CTAFj*?MZ{E8t3c}M&c9x5E05n7EHnHAZ2$DvU%u`;&040G|jeMPU_0Pty;5Ux2_-*h0t|%jf}yT~e6A7UoyB zec*&X2X5=ywzHX38l(`|(JA73v|TtR?Z_uQ&l(}}g+M=U$Jyz3!>|s3vus6}G_^^5 z*WKA#@Tsu1TQT&40!3aa98J}eR z&9)OZWt?hG3jP9u%}8+Q?- zxU$$-MFfMT)#V6-D>tD+IA5h`_=@7ajvJ30evh6s=D_Xx&gBo_8?OrwnEJor^ePf? 
z7t{ec2l8o=w8Y*|wTrw*1F{ox?LZx;y03Fxjc0(bL`)2~KrRWzGYk<#8~Lt)PCW}#kRg*AS+Z{ndoFVN`=sf6 z1Bo(N-hELz`;)i)-?(j6-=KuG2LB-jSYnI%a_6WFZye4Wz>lcG2ptNp#PKo`94Yg| z(KXVHN0XzdU7|vY!F>t|=f`2*a|22dGkEQZmwi>KmLnuW$SgB6XNc|Z2EfDAj%>zU z*`DpQFCw>vqEM#e%YGvpqNw3a#Y3lXvVamV=0lmiv*ddS!72x?+S-z-B|;E?dF&>} zlUOoz ziicgC;`)cf(d50td)@u6EC8JFC?}~~I$2#h(q{-ik&$gp6Pl!j`G%yV*c-{#){Qjc z^*c`NwfRT@CVf9PCJU$$V+5F(Um%?N4{t>HC^aC|e?j7cg(&(a-0EUbRGDp=tVVwb z<+tjP(fPGjE5BQk0J`brff2&TJsN~gK$Xm_O<3h2-;}uOV6lpMPM#BX1Yc~+6B3H0 zA{8O2vMb-0fi6!J>InX2=ms&GY8akC&<-AlQYy?6E0`+^ z-D>uOh43ws01|l!BwxHb_KwYD5lq+K9D;lz^t-no3yrhl^^6G4y8wP}$U1Cx8#ua1 znPk*hRW&snQNu!6I*Uavxpz0k^KgREftA^m_U_gLvlJ-}x47NqgE`!vzSAB)d`g}y zQA|?CN^mY>w08{CWEk=`CS$Dyy}5rD;t0c<-*D>jC9fCN&CqEIwn$6eS!Cxtg*1e6e z2Xa&N6h?;xWk34nAn<>$tS@eOTezpViF{d1zz+!uX(o&2jK@L}&t*tt3PuZ^^y}Fsk-m&BJ#Y=5LFJ_2hiJx&xlV#9eOcWnVvcgy5DuGqP~!Z z>{|jmM3{?fK;h!=q`~;VH;^IxYkHYH(o;pGexc^gVLEX#4ZHht)G<5ZFDMO2gj8|= z`wFIrE5tF`Yc1qe=RyWu@OWC#b`>`;+XHi;_dOc){`c&hz4$w(9lwR zwR4Y+W0`fY-WA4r@!`}|B8yHnR>KA1;6hYoTs^G*vf#(7g5@%nbLrq2d z29msa@vjML`<=_bH|g0m+XQcXm3PnI1Kbh|L5R;5C|c(Hu83eZKFW1y&Wa4)HrJM( zi`k|qRc6rTDdSt(+F@gb}F>&w3tNygHNSV&WxRLDO97=cyu zZN!%rCavP%%uW&Qf{Ag{ubxrnj z2^D%eYIhKF>L>fLSSZYp#N12Q6c+P+!HD6l!Cd*-8Ah@U;*km|#4boc&>mD4LK)8G z@c+0vng&w)__mS$&+7v@bY31qkAuN{XNCES!MMeOLCX3`yW3g+rv507Wm%WI7%j6_ zO%rT@obJHy?Gh@>Dv)3o_j7OQv}(9R-DK}L!O5a5wh_hOMN9|6LxE6`22}#Xz027b zdQ79iE`Kb8$CKKt&xJr>PTJO$S75k+LCGJu*9c)(MvZMFWz);rV`D9X5FVL$@4U7d zL>(GR+bp9)`38&7wc*$*BXQ4+^0`cu4I#s^*n!4XcQalS*R z5IFuOp|jl)IYrs~ffpeH9PdAT!e3+(=C~^0Ls}ToJ5eti6To90Y`!tZBh+WLm1ksW z3nTvIy^vV|Paj7V0Lc4r8oyPpe@XTr*MF*WO`?s{j2w;`7soq?T3Ib5^~-y(*ASsJv!Kd_2t zG7jv;-XcInapaiX9R*d8%J zK?bip)m*Op$;p!v`g|Ct0gaP(>Eqt6W29mAv|{BiY){jWHRThygYtP(LVqTR=ALhR(cO$V?}- z3hoU-YC*R(t1p`BSNmM-WOe(4SZMB^&}VTH5~C(>l08r>fQ=bru&lmk5rV#o7_0rC zE8E|aE84a^E_sNel)X)%%>9t2Q{IH{Kx$bDz#9W4KF^==3$Pcb4aUG2bmC0E_HE94 zYnZ2fM7Nqa=Jo_0BP?fH$=ur{VIx>I@7__X8}mRg^sNVyNmjH#SyXeX3xLUSh=S{3Ca@QW*K zc&27$_?!iye;^y$l4d#80UX9yp7^$qdUK<-Eh{vaO*s~`l99NsUwcO6kSQ)DVz%@I z^dE=502>0Y3LCtt>c^;=(j8$k{%iA$>q>7P6RK3kHPmh2*cM-6_BQDrPavkBA~t|9 z$4!|kp#!0R4bFX~AjnSev#O(6Ept>s%N!d#gB}iuu*psSBdm4PL`FpgnTEt9XJPOX zdU`tY@A$DY98@_rr{8?>^NvbrN#Vh%S0K};{jlwL}?(M~=3 z9#L6PkH_xVD!|CevI1|Xx*UXW=3*N_48u(Qh(7tGCY-jA$7HM6O=e**VGO{;1TKsW z-vUShtd7IU@anL5eV07Q2sPWdqkDC=z!&_p*y4q$iN@}BmANJ?a=G+L<1jyw?325M z9LNhvQqD1&tJz_RQ4Q;(e2J?jBr`fDbK7?}ziy59gtd|u2F6vg8cG9;;FUX7@l4Bl z;QSzDP+qnC5LwVI3jp#@zw@2zG67OdNF`OaIZq<-UJM-9g@Iwp7cJ2I<)Su0wurbe z5GNv1WHgUO6WtCkO+ERAyf&zZ5!(w%X01}M^m~GGQ%2u|kr-fKd_Y($iiA(|>?r2I zvn3+FM1s0r-P8L=1Ey z#1Fr3E-QH!NhC>*RPm~XvKm+EoJj%BE~^f@h#cNc3zxwB@c*&(Zq1Qh$DQx_0`qv{ zJiA4?yCt&uE{fC?fNB8U02)G}yD5>Bxa$I-fI@AkEZCA_TKB7D_36rnJK%7&W80iq*B-c zUOO3)owtIqpB2b8%t=*7t4HKOCP#~# zVXWV`gqXs+>b7}S7_M*JynhnKn~O1SBmO%32TiS#>jEJEXf%V#QXx(<6Kp!Na43K# zB0G7LKF&#!3=jGuJdpx%0hUP?KQ&CE{K{e=(;sO}R0?%7q@j}#;~(|{SF*a;6l#Js zMnAkq)f8zNm3H=?yo}r$`dUJssSTiE7zzIvJNL0&A;RG(F^A0LGX~3+7vezg(1QR> zCMRXy1^~npvsrr+U@^l3VR5lX+?&$cB`{sx zrc~C#o2K_1miPCB*i#NmGkgj~wk+%;dkyMbnbbBAY`gC#GX}$cs$6OdjW)*385G4} z6WMND=wfD@_ylXyd%lE;#D%oT`!JDHxRt{u{jZ4;T5k4a<%o?Y_)u0e6$lRz?%~60 z@(kcN%PW7NLEirj-CV#67D5`I+YRnS1-gptoic2HeH50}!zAMx~h-khv#`yOmUUO|x2#g$=NUo943-@NNYw=h` zjxI%NRC1h09rgOELf;}epb)v0t(pCklfo)V5ek6v6W3mp;hokg;zIx4@g>H^ACWT+OBb1XLc(IE#Rr824`L;9CtQq%OzW85!))#mc5yfg^ zo8+qJ9bXG7l#uok#ZKhs3RoTa35@bJ`FS||Bo53rur^r$&iN<<=^+` z(!DG-yNj1xwvQrC7G_YunNuAaHpQn;@OUZdn>9%FlY`ku7$V~0#2y$1D8vm7GbL9f z0-zzn^*~pBt@CMi5}yHE1Hk)8ev;=QlwO`vW>4rV)SCTEgs1G!-VDgYMm`jzASBmn*ihLq^jJT{^uVeT3Ia+| 
zDfDm}gP6Smn$>T7-bbRDr0lDcNI8JQ2boy3gd9#NiAx8vmV(kyo_c#$;Lr~ z#UfX@C9XmLlr@0mJa#%MgLbGy*%*+SCK#sW0P)3U^rVJR;hGym9$L0{r#wIn5opbK zSH2)q8={g~#asPT4>^u#3m>VOy1An(b{-IgWV*O1tie#jZ83iDb3VJwt_>sr0WR;43nD?iHln%4hx*Gp5+O3-Ud(Y9JFiAiVqT(lkYDhTIr383ji_3Cy%V*|9tQu0+3nl<))hQ z203hui2Ix<^9-Ap_rtpB&4jr*?&brKbUrT}MFkw4asKGCEe4XidE_K_aL4Uhlzf9+TXKP+^4= zf|GoDTNa<;!&@@*dRT(!xHsJK<+4B^p%HF=jm+zZ?T4?%pTFjtv|m7s&n<;5No?hE zG4DtoO=90Zc4p9Gc@>ox;g76!#l-%81!VwA^*@8A^B3{2IeofGX&WASrv+CRVKKq+O&Rc2A!nOr5}KDAGLC!xo-fZlkFzhv zosFjN4fc>+f1F&|{>rF8zgys!(-3j#s)-yDQmvGX_Cxiu!q-D&igKwv(u}I+2$x{6 z97hA8=|*5#U_pEeb8*0*8)xI_0?~7pJW)-Wz3`dr7dA(mj0m%8TB>f(~*8 zo4)sqti=e{Tx;Ta`ju?gw=qDSNS4b*6C87>gSNIzUV)3#7 zvfjr|f+F)!@Tj6mHtLUu^ zYi=p{RkJ=wdS=TFTTUr#M0)_Wj?)8Ypg?Me`@*nTv~^?ZXqG;RV|% ztFRDE3rPjndbYjw53`$w@9*^sz!2n-mehN@i9X?M#iVF9M?HJ(`cydw8MefuLhFmz zFv4!c@B3BI>Fl2iJn}t-Ii$oC?}|`M;0O@vVr0;k`m%3V;SeOct*F63b{Qwg{_Hnc z1e{QxeOb1g!WtBKa)L16j?Qe>vK>>VJs!UZj*FX+nD9OW?G42ZJHzK zSzRDpLc@!o6bUfPz8l`Ca}pyWioI1fMVZMwR=p^DwqWK!Un<&xxhO$*(O-OROd%`c z))s-8$)})$rhaiO^+T6I6&-~x0}`6s(OaaydyO?8YzyCl(8VfeB{EnSMy_qm7{}fo zi(~I5W(9SWRO8t@+Nc$#m!&k6MXFQpgP0NmC&Sz5DK?*K(&Xxq{a@+zf%#>iaxY`S zR|zP$tIrs+3;@SQQ3V|@8W5E*#8qRJyQl^cJ_2LzzPcw_#Xzu(^pnLQ-9EizLqk0nMcL^^|l2#^3X0mmta15f(O zpsrkb(mP>XjWaBYa&M*b_Bj%{6)rncQu+2IRLCD>auhqF76S{LltmjN8lB3HU;L%V`S}u&g`VE!jAG;t_4?u$7q9t?3}k8cwqFD* zYBwk5tvd8xyaZ%gR&N(JfWRH{PIFyuDudaX!H84feVH4sQvPnLNse_+Vk}aRzs}G$ zdf{0%h08>r`8RTs@pM#O0E9u@mQ>a}F}@F#xqtiw9p?mVhu}wjCiJyPNfg~@hR+2O zt#As9*nbi=glazNMM22fn{gT1l_CUq54=g1!nha-eG20fL>Gq~8Am=t_$Td>oDhRf zZa24_Z{d#$-Fdp(j*FY|6%lW@3(u4hA2rTozp$0zfWp_3ldCL+{A$@vb8#zD2?d{E z;nr9sB{c(p`*QF14t*Q&?q!7LdKFqEEv)w^B$F+~brchc?_A}z=baGjP7p4TAu5m_8;pw6hOA;dsXk;)xR32yJZV%8H200_v88B5Ho;SG~n zDGS(Xg+fJuts3`~{X`KMqism%Ph7itLr%M)6GEt=wI*D`-@cPzP&D~|QyX!dK_s~%EnSgWo-6J-HVqx75^!o-L2=YQ^8sWDaK<_(D2CIZa2x?sUt5yztK2UBS{O0 z%U15b2ROmMvwW8lm~vQ9izO#TF6zsvwy;2a4YDB!E$pzb(>*k^EKai{RF^-{8>)S5 zS7vrB5i%9%QmlF2)f`rLuiraya{LCOu5{ako75^mQPKlH$V4e~5mQ9CFc{0l-}?q4 zE01EgS8da*VpYsy_7+xbH7prA!Z?PXOd4zMo*t>?q%%>E|2<^beqf(?eI?A;*upps z-V*y{r5itVYNA4>?~Sb<&I?~!es1RcSytdeo(!wdBgrPE_u(cP#SpipQ6PMghRZ6) zdoy#1*Vp^v`xif#^-eh8Q2T+BTQjHIBk$;!g@P_4Y_lzt_YdM$TU^}mJqF!=s$>s`jy(YKDT+Aau&F`*Aq~Fx zUwj$JB@{v&s_-YM^gofguGg9tX3=urQWZ{j=5{{>-yEB5JAVGD%xZ4vge-mK%A$i? 
zy-4_G+Pg2>Fhm5#ya@rz6kd8phIzB_sa(mh?8D>GpJ-wS5w!5#%yh;=o*$cS~LJ&Al53y!yv`UbZQ+ZmQCU&g*x%K z5#i@+hcM<#yCHrU={CVSIfe2k+gh`Gc%Mm3^=25Bc`}eJry*IAoISsb9a>!VM5-YW zxc|qOrlvd?0+?(|Yw5O7{R(BM723zub(u-oTVNRH|5$q1xxM2IEp0p1&fdxiM)>v)-U>goJ-%RVmWSf;Pi(?qKah!p5H5qEf|37;_PW5WFYlXHf&LKg zl8c|_>MLcpNMkQDu)Rh1XT%yf&Sz5ULQ9_>xFUS?~~ zN2I|y{vh_M4k&&pOPVO3KmHUh3}=2HsHYl} zSiNJ2LNQ?&HI3W)S>d>*05@M2i>i&4yWI~W=%3|80+vENJTXaND$Xd^qLR{-6YoTJ zP^84+R3J$hKuIRtA0i3$mBVn2STvZroOgQdkzSCK!?3tnis0BM&K?dZpQ{CtKNgd< zQ5_3!GRHV}zObhy4OYg+Nw6ph`JzsFzlz+DL}e)cbiM_cr+f>U@n`3s`ik_s0eC>| zAE;e`>&#-<;zEOp79c}!ho8;t)lD2GMWo2+l5H7NQ9j8zg&R6haX0Of*TRXCTf2w% zRXHZXYDom}L%OLceldYSXu41dTNm4qisP!#LZ0oQ2{2_oJ_414yD&ZQ5i;4sI+RBu zgnUnV&FCYVAa8)4+f<3Z3baFAc_*LW@NjZ>U&fb;XNIoruE3T}I@B93%ghRfbnDp$|dZ?{3HZqOB z&2wZe-%Wmjql%M^fhv?B9udg~Diqc!uLGY8TP$Vg3frY6`tnui?<*f8A}4^g#8F^4 z_9l_I?Lu+ddFff_Hf=4&xo2i&xdibbb8-5dh&m@Xov&4cJR07JB|4AUk?$wGJ46d; zgy*Gsc62H>KI(@cDe#2L-A9{rw2Yt!Ck+g|NRVTWG0v9S=nn~w_W5mn!~eCm>BDEG zCI1HL=>TK1=)T`PUIt(iCT4!4g&-r^z~n@mm)?X|Ihux@FoblBpKnS2r)fJ$ubmM( zH~zvGju&6N;q2{6P+ca&vRYaTa!cotH( zp)TOeE-CXKj_FHZaj1tU^q%PwNgRwdjwL?>D97HZ<)u{DE?1)sG!P*jH30R%V zmh}A~mkQd;A`cfB#3u`fD=&wxSG#p!7EH1p0p5)1F`t_N)8$B64eU`Qc}IZmsHDpE zhJy{`ho)?}-)mdSNufcEhByZDRPc`kUmSk^3twmi4TVNpiqOL2cK#`euhovwt&U}y zAWw3}d3<@;W#xitK47lH@QO}$gcRZADg!mF;0bk(Tw4<&>Wj?v zsZ1{fBTe0AMV%CSozOOKHtw!%%WQzw6fk~1)4SsQ9}Hrcw>cdITwc=AQHgMF_1ex_ zj8C4w=PalB4cTu}|1&w+J95}@Q5B~tU7rAO3TCv~!{6@n+Q~OGR{w}gJ?l*$SL_hG zwKpQHxJt&M2F%_je~pahzE#Xw%=QZiw)6RGPC!WdL07FhORP|hG6+6eveHGq9M(v# zBTFZp*sDF+MZ%jIX5C((`{? zj%i_d%m$9fWmKFQtA1oAPydJ-cd_k2dgHdG(uF4ZNwTgvNStzxd*M$o@V^Z4CVa|} zCqk^J*ahkn`9mvZSeB0xEVg9W^64N-#m$?fZ1iZk zQ4a}~#3nV?uva8+!b^gcTl~KtZ)EfYFPHA9M1{5}BPTUls{Q3xg(cAmtRCA(q=%hA zlulo|{#Zj-I7d!RPe0*AV4453d`T4oVNLp4=dSb@Ew*1T6?lG-m?X9>P|AV?eWh&;3fF}>eSKW@ zG}29C)z)uFr6EMoM2*erTgn~!Z}Pa20IyD>t<^gl!Ye3WImQv?X%)Wk*ndL2mzgOt zU6=-Z{(oV&Kbg)~6P=a}}qmT@Qc&eKYL#SUu`tZqErs$CJlFcj*$=8Gk7I4t|S`zQb{Q>j* zvF|N)D^dQuNdmjGu*b*_r&efDPO{10CN%ozymN1L)qh#}=ZvI5PW5(?swxfbTI&@E zVS^j1oDUEM7xg4qKpJCQ701DQ=p7E;jn!L(hlWCFNj4$>GQxiXY z7B;7hn6?OfChU~CJ#DLm2@)|_!RESe_pl!cnp&+5A*Dsq2ZTIuIzXO~P8W(yAD4c$ zu%eNMys%ghg%{h(h!fAviNmm0%&Drf@<$rq_U2^e~#$UsSku3h!if3ufrPs#Qr*uB1z zvIXTJrT%ayOd^|@dSrl@Jr$C-+ixr30iq+zakdY#u&L&UuMg|>Ge1i_+rie&!pg`j z1}JO(TzkX*G$7WiEP%Ik0A54{5xbnY6m6Si&F+|A^h9b@RoyV=wDv-%;RwXP&EDuV zHBTk)zZfsKu8fuBQ|+n#!Tfg_Lu5o1Cy*a^Cc0ACEzPbUo++{d_|$ZhXYa%nV`+?p zlOUyi!gBsZ7%A5ZzAqGfJz-1J1Ld7Wh7``Ab-ago=T-aeB5UQ)5JtX16qbrZY!X2) zvd&%0gsfmD;T%D$8i4NIi42A;iE2nu9X0y$JXIHMj3x?R2Y}=xuT)Y0% zbJCot%fV{;V4G!tlzPI;Aur?g%&E5eRY&&4lu$z>`5b0%ndQlUF>Z7;X^Gg1tclv? 
zk&U(-^`GX|PlTq4MvuBXvUBAB!jTa+s1;<6d~G%?@r{$XT}g6~%@LUju6twiyzV5- z*Ia{_hPLEB%x-38a}dOW^I9g#6^Zd$ZV@}&vI<~&eL&TtS4016Y-~H-)`QvmYB+)R zlx3OAp@>t4J6clTsINq*zj_d;x)p~E-=TL7s7E6mVeqj#b4+kC+bdU{Iea;4t~AU& zX19NVQV6&aWw1*42)efSnOmT`1eVI+H9t(<+<$sibz zSWaHF)DKXyp{e}=CWS(vhmvu>`3~_}!-9YOZGKn#&b$%Uf(R0{hb#YM`7%kI^1DTx zylxdLV*yw)%}LEqDXEucR9xV8^Y(@{fj=lFA|hEKOtbW(^!rW3PAOeK3~q6<(G@-~?v*_rcBnmjG4J662WB-`5EaR|8Ng#~U;0fBt^Edt@*STW#@Nu1VNwP|lxZLEAOHJb^FM!|L71O_EsyhqH`B+C zd{e~ZY*eVx4DY|_33<+YfxiTgyh+TmjK`9#NZ=7^^7}_) zwdCWFj|CAXf8FY)g4i82yNkaTF0fEj=zxbs8>&}u4`&Jckm)Bl- z(P@<64Ga8{9U{*kcdj{j{2k^zp4NRi3}8ojgQ(?fBITv6en8>*h5v>5$CmOJKv{-< zaPlL8q6`n+&$okDZ->amk0s5tJBR$o4ZK<9Fte{c^nGawh>ZqN{&JGQPB(7L10dfr zgz6+B@GVIC;=8_%n>T&T(;{Kr7WPkBBxI`4#y3K4LQ4&!z;*tUr6z;JUINv`3uf99 zBEk>Px)=xbHadR2S!;0IX;p!gS{zZyF`{>$Dw|3s#Fb2|H#m4%bxw7}f!xksi5a1Xs-S5tBeP*${UCVtH_fc=}8 z3EwTF!x}%L@)_gt5Hs{D{gX*!7zwWVivHD%j&KN@SHTEj1Fl(Y)!yk*ozvqz`rD}% zr=zTn@>v{va@NQuy0KEMI z+KH45&t7@jU!e(N8GV-3Bs*^bus@KF3@eR@kWtBe3Hi=bvS;i!*R#aW9rrYFYE0?8 zIyed6gKZGvvSU=7tDD3Z6SXOWNRfLifZ77qshE`A+hkJ+_D&C$Qorzva^%pjifAv) zemRS%Em*F8^H}bqna)|IKq_$-6E}i&@6g*&<;yDg3z6WSBF$RnV;o2 z=S3ADt{Zrh3PAHh3~B(zsu_N3!*&;@5-@56w)l2i+2kzi+zKcn)BgD42eKK$dYzpMHIE<{>sjy!W`i7z2SBmzy2+5?7 zfo=S$qh>3P1}8~kb)e6=7cgCbvRg>saX}=Cr~^n$qc~O+du6Q#6ptyIX+gx;bOmvJ zlsCZ5!Z>OUW?9?@aXW8?p^zRM%ASc8lFN)bZr!*8Hkw=VXo0ZE6M@3zAnVK!4H`EJ zz3r)|l;c`tl~dj9f5kP-cqR|xmnG-PfEs(Y7as1#wTAD#MVhKPI?4ZB%nC$m40;cs zX5nY^M0I)CZDf{mxgraiy5~wlHXXQ_V~Wu)Ro>nJT!8d(eoW-4<5Y z#d0z*^Ng=BZ(RA7Orm8eDdaa+teh~4mvKJ&skc~(Y<#2^&Td4MA`L6G5o43@=h-_7 ztv1hqVy~&zK=6()(IzN=Sg$WV+pH3x)j*bQ2aRAJ68VcM@1Rb+H6t@9S7vt)yxN)Y z0*3-93ADLBs6)ZV>DhYpskcg@8B5h72a(d3(@}6KWMLA;`P+Dv;Wfcn8B*(1zqK@! zM_MNgCbzRAL!>}%8jJ4&EsV#3ujSXq@?spn{}~*H2Nv8Wi_^tniN#3%a-Xcq*t}#W z-L#}WcG}pl3eEVBKlmFRn%OAjSObRrLk=t(R2Uhg?R$v#y%6DO``TB0pGVkBR1%IS`gbDfV+xeVW~v7Ma}t$K2M=Mf0xH)W+R)nLJ+J z43A0kFh$XM*efsidNxy4b>ClzBrK?Wgop;&$8`eB@did(-2`FiAS50>v;4n8y_U_1 zYgE<{aX2uT!`T}$?A8Q6C~uOfB`75sfRSG<=UErLQEf z+)fTht)>~Dj!h;fr=7it87)SOyU16LiJlF$D*POZtBN}8r;61Z_8y3?WJUCLk$3ae zXBL^_)RW`1k5f@5u}k96Xz!?}x#Uv$K7Td&sz|{S+e(puct%Y~PXD_>@Q*X%{=TA2 zFjpyXs2bRyEyuf@*aTGL;`YrTm6_aZls>0&2#80~FERlQ@c0IeF~bsLV4OCoxsH8!5QdtZrsvL41%yxDTcJ!5>88;1hy%ecS{K zgpC2CLC%b03J8c@|N5}wyMX}lKt+bSGd+^a-6+)kc)rx^%xEgsKC^kqC`pVE<3R~h znV=ccAIp=y^d#bqU_^M>K(G5Inr^LU513v~AHh(p{)y^oi}Pe4KEJhQqt)~?$Lo09 z`w?&Q)n|R&mNOX`fqz9nU_K`)kDQ5MzRK8V!of_Ft)2={Kx!34@39$vCJBW0KKXRO zsrF@s5m8RWM?fDL48)t3X3P7(cKwL4T9rd78U&6hqUVy}gR&9T@ISzlXxus?2&VPOWnVc2evK=*PpZ_&7 z;Oj+cche^fwB@sBF&=%-e~|8R7m-iY_7R#l z1`)=;!Y&aDC-!{Pujtj4Io zZ(_&{r3D9d6Bo4KtoRN_RC*o9-8K(tj5+cuJ~;2yZl>M9WZcWSp6(9K@EG%8GpZmA zAgXZ;uoxc(8)2X~`>`Jwev5SP1_VIC%7(Z@*rj3Ibla_Pu~02Z`?!zD4&wvfdpujl z7I|3IN{&L#{G8b^A~V*YdV42Cu$hvl$`*jLLyncNAm40Z(kv2pQ>i=hS{?Lb!N$F) z>0~Qe6r44&btgw%m>Ek%gtox~Wc%q9kE#ses6GC`&=H%kdfeu zD8M;H8j)9~x&*kj6Um2~@p)_7VHWX|F*MPOe57Pfio4!aC8rQxTgWvT_ABiv>6Kv| z;LR0Mg?NKj55$izc<5Hzg*XZ zs`+X|gB~eB?;X-UPY;OmiPXpxHf)cNgt@sTK#$ezC&8N!Q~ZEXbCK+|hwn)m?SVs<1%g{LER}-I_Hyw(-v@yAKb83riz5Nv z-L?Ptq}Uw=DS_#%V{~+Q{+BZSF7+rePGq1KFD#CtF@9MFJVnT#|E-}? 
zd`!GRb`_;`^N!P^3LKb{gHKHj9~rlQ=PXAo;38FdvW<<=dm+%nva|RSBM;Bs_h*ct zbN2Z!RM-0$7RZ_%SyKY!gvu+hR#S66*f!z?9A)g zr)8#_#6pvcPYy(tf)NFDNatU8!8eiBeb+F(LKoNl z`KF=7iHytQujWn){}jYQanIkk)Coe2n{{NoeBmjd4ESVO zE2}P;{2nrgTPR@Og^fAXZ}5l=AoEiIFUYU{syzuS0$~JU>S89wBR=*QQ*ssAQ}mjY zNJ;kekY&u;#l#^R*~Y;7b2MIstLPO+8CY^`IZ?PzY+AB;W1OYF)w55uD|0&8R{NA@Z_{yDZx{rfGXj+Wm;1MkjV;d-&qR07mw?9s9(s2bECeD99_?yd z+>j~VUuM*i_3RBE{Y=&qlM@0AD!E#sn*HI4I(Ko+2xo0>ZQa?D-<1KY<@at~4TAPq zXe^U4M}Ce!e_e(H>~!LKgIzvm^fb}j$k30+O_g!)S=vi!fGiiGUjydNh9Lu^I*kId z3{Gu&{rT^mdKge+O}GeRJ-(-MHPj4l_r5Pbd=RU;%zTN1rWvP6#2?+#(M}O@Hpmf` zyClbwX5eHwCUP#CWj}>2DAq~Ybfb!FV&v985E$<&evqE!6Zb*${YH)`n(lx8x-Sw| zeH9@Qtb=6$#0*moF*MQ^;p?)Lvv>TF37F@<_epUkF}~#7)@@uMG6PcB5Vh!kXyMXL z_yTY1?&eZsyD6K5Q4#y>KuuF877kG04l!L95?ei_CTjKorHelm2zLIxc4yg#V79tc zp3(ons;zFcnp@j`wl03i{ESwA>Yq~q%~z6`oN%I$WiMYSR*6D+5Gs&Q5*|x)zF_o6 z`jlhURSY>CjDzP7LS?{Vycc~_jzW)13S#m>^HmosCdGxMAG@~0IXZijS`Y|X!r|sg zopGSaTHNBE?J2@NDVp*Tl2VlQ#eFT@iqAh| zp(yz39KcM}xMXC8SRBkTLrvHXKD9Mm8*7W(o`0X|PlWD?IBx z`lT!!jgrlj#rr^x5xj1PBcW0Y!2-#c;;RS5*8e&CD1v?Ib)^WHI;uP2b0#NAkS*2+ zawTTJi{%Fo8m8DoX>&BKG_3U`JT>!$_uk8CFX?@8PgkHb`#^ZmwHyAUcS6E0*+H^1 z^fZ+gyRai`1kmR)$iO>AGV;3)!nx_cSA6EIvwExGzuQWD@Eal4X)3 zWoUc&4Kxo2HS{dpApjg`ZtzdyHG{}oG3LuP-9dV|AsKE`GFPvg@rt;Vx^F| zlQb{nf|zPq{?MjnLD;1%bO0)H-eNXzCres>^shcLK>6fkIDcOjA_a29msV5O*ALZ8 z(xxTkMb6wsv*52Rua?-+kWQ6N?^|V9>1q{eI*YFLLB^_g%$}oi_)Vk(`Eqfq$(`)8 zpzjmpbR|$IHqJmRk|>?*MOY#*5>w~XvidSvN?M;R{Fi#U{)-;QRXFtRXk_(-XMoha zjT?Xb?mx&mwxzcZuRM}b~p!c?e0Y!AcF6W^9$?pL8%Zm;>W zJm5bc3vt=IDsYlsy%0-o7isoVMFCv~ln$Bry?T%VZJ?SlVFZgH`;t;4!Q;WVt>jqN zeHQJ$4BH0tTWrdy0uuhdoK^Uc#ZQhx4Pc>>jpDHKIn}=gimhB?cR&mq!TEwfK_AGX zjSAs>ycfX?l=Ya(vA;|ufrKG{v}$!$Ql`kx+a!w`GOzD>!vaBV06wwHs1a#~B2Naz zBSVJ}(wNjGkZrWX>~2m#*LN>7nhONYgfyB-w?SO*`z?cpL!w{xAq+ z+?x~uKPD-Z3Bz`B9-wDm)Z!Z5*{^+qn6j}QD)K$}lT#wv%|u6(&~eH}Ho0+R>un%Y z%jfG2FpG1&<4HwVIOl*T_9IG>bmK_?%zg$`)9{nx2+^L$C>XL07Ae{lM?dOCJ_Zd= zlqF@AQ5^=WV2BIZzhm2%Q33qgXgWulhih4OLZ?gW+=3&QR=@R%KWZRJ6yctWPk{Sk zp+X0giCDORe*w6Yth7RC2o(ga6`W+;Bj&Ulth04`hyzJjFi4i18Z7C(@~{iDP!Zn@ z_=iV@K>>Gi+(i2YA4Ig2riiya3^l~!#C;Oxy(_$8K$vBbNT<89y}l;wXo5RsXtz$R zgYC#cP#%HK#cc||8*u6asqw9-bbxsy^Yp9NuWR`yQ6*`f>RPhQmY5$?P*^p2VTV$R zPJqchY>7rAnMTqkCZjWdgAdkcZ(9_cuL6G&3gp-9hY<^gCnoB_ndt#dmUxP6A>r*l ziyLPhEh>ALxacFr2$p&TGuaf9FlFNm$ZUCp$DpY7l=5d_VD=;0P#4_MAS|;C#)4fs zq5ZHMT-MH49{nuTvPW-(I>$$Mo4yoH6OF^lKvu()o6=c{RhkVEB3*g-D}3AuBJo+V zGD)iN!}{nivw?sbB`FKVf-6N*yEK3?X9_8uDBiz_}ff){_6^W754 zr|~Wre9YwTE6*a_tV3+JD>w&R5)K-$#3|M??}Kj^=h*;#+N}6UZMrQWDT-pG9M`M9 zTo1qX$~9+hB>4x75WQOkUzj>S%m#zOFK&Yp1X$`^c`%y>NeG$P>cEO@g#fSVSmw-dAKW|@{Zz{4>7h^XIAr8#^I@E)sb6d-!=Eg&8xGRBluq8(JU z#)_|(+ey~y9zOhb2n~#V`EE8j{+k)!WPdDkxViX+miDUP$$+vy|Gv*^P))Tyb5qmO zK9;Cug)5G*I=NEUtyo|vJWF@Izd35ikHtz$E{x0EcO(Os@nYMq&ftdrJ=0q+Fbm{_ z=~((GA2nBn8b^~=MLs5{g8bT>z2U!-N8(3LtJY1!n`F$fLgUHt>>b}l-ok^M9f17a z?FHtVU#$T000F~`OCmg_X+Mt6<<{ zPKV~BmP$R>u3h)Fr&>ZGb;2A|{xWeja=O^-reVf9Ht;Ja&kQ6v!^ zgW!|VE|70KSQ>g>XiW8PSUPC~YeE(GxBXV^X|flf*!i~90v*nz$YiG5Qv|^}0+Tsc zguz3eg=&f6GrMTk*<#uMd$YHFK=6o+!LU`uzpYL^>W#*&&dm#{%1WdJGv2QGDK1`> zBSWz`Lm&A2P3)O8<)ICV3rBJu}naMXj~r^GhIDq41hI+Xn+-I7Drj2S)@#(h_1 z%#@HMblqNeiseYS03%JA*JMlCVHH}YN3ff&N*DxQn^2&rp=2!S2Z>Q6!O!3tse+_& zxS5iV`HyAf_v6$Lj})%ess_jMT1WM~^S{Z_YA4lRcuFx{Wro6>6gsxq57CeHLM$61 z=lJ>f{IB%9?{RDRw)5_~MG5%aj56gJ5Zz%|Pv^xKwtdl{o2-fVIU(}h`CqG*E>Va|rQ61_d~KF; z)kO}zE-4@#AN~@SG$;OVe8UGcCsS{zDug?W8pLtq4NS=!DKvTstxdd9B6cn!KQ$CFk1%Gifmvn zKKY{1Knn|&nqP>}$IGtHg0&v1P-yiuMfOWXY+pqFu-FR|E{CbB&2c%Bc{`+cdzlJQ zwV3wKT)~z>S<C%2o8M;ec{ZNk)Q=rikgs zvcXZ3p&g~nCb+Jb_G&+{`W~H!&77M0AHRtvvV>+g;OZ>T8}K?LRM(Y 
zC}08#hN0_m9vf^%2AEjoh%|%VJ_UnSEnYFJs6WGt7EGJjuN8TSuo0q2$ub)iq=X;Y z!enSeJ%+eK>$nnW@qUqp3`qm=lvX47p7O=QcFi&3KkW`d78VY01ahaq^w|wNRogpC zEjA!CCCLH;;Td&6-^h7FiX*#F$QMW?*u`WG=ZVg^j(sr9L?PTu7=V5M@t^p<@)p!0 zzqYH86^WJ717zsWB6b?>bQ*VQcbe^KmM8Z7_cHi#GWo^L#Z|2+UI??;wu%U%%{yD${@U!LfuuRPjP~&M`n&nK zn@#89M;EWl(T2XW184TpE}B~SIT`UvA${)kq6i7L$}m9!I@U|2(~FC8bF1m&Ahbqr zNou|4G_|b+vBj|HGot6SDwyo}NG-@1*ahLin`2-APAuC5?Jd*~WXqcKgrq(Ul9$G~ z>yuM;RzZr{e#3PyKq-tEp)&R`p@)+ARGAO_(JwOe3g8J&V;>mf+~m%3$~_$@j>k43 zhcfai)-=62&!qVIm^$N55p)Txb$vCq;nocXt=6*>Bk-Z5Qc z*uczd`8a&UfNp&ajVsCuW6i=aqA8oLJOk@I)G&P#d>YHJ)X=gf3)Nn1I@Bq->+$*) z3bxrn4HD(!VM;!pO<0rvyUpe3Q%U^a0Og>Y>;m$4xDQ@;N=wyOLuzO8Ko<(ZgX(7Z zVpK_8-nc6VXbTNSslRckW`nO|e-$>`;+iT7tG9^Z@}baqV{J{E2GE+hJx~ArUwj59 z{m*|P&9W>&wdD_6n5vFd>jH$=%_$D=YA#4qKhNKnL9Axtpi^^?_vJ2MoQuMchYw~4 z{Hyf6ntYV9$4x2YM_766;6gi+lp8YAJns*{@YMVp4h#4{^zoLco^|%YPt&0C^RH|I)oB?~%dn z)^V@wkjYhMNe%5u;eW*~nxa$W9WvH}i<4V_fpK6q(iOg4MPwcQ5gjbwn}}P~Bqm1{ z%2r{xSfM!0B&z+u+p}{wtQ$$ZxA`qisa&8D0KusoGLwFkDFeU`~o~?sB zBB_OR186?`PPVX2nW^4a{^Gw~{6y9RC@7}4e7A7oxY*@|R6rPp>AbBz2S%ReW>p|@o$SA;70Jgz($AQ~Qm5cih(wYqWhj<>^4OfdfI`~$;tORtnkhi|G) z0!20jZu`toKMK$&K+Rz-`tYUMnwZOr@5#DjJ-49f0=0LEj%A0^y^%Es21yEu=#luP zSSTJbSDyuwz*RC=Q z5GhFmSj^6t>w*B0prV#g0!Ef&t&?P)4a+4iT)yh;SE^lEQ5sYfREi%JwI?`Q;mlno zr2w{RUkS4UU;JFw|5j}ql^K&6ii=7&GzM0=C}<5h$KcCR5s7?th4w&XhMDGlpV*79 zKIa3t_`i*^{FJi$A>H({M{f4!o+i43SBG3hZCykG(aJ2Y!|GbHPBJZ~K~nDwADQRn zk{-54{9@={3Idh={^FUaY&GpaK-<9`VS9A`W<^-`S1_j`)=goWC*T%<&~dPC-9 z1q(FAl2Ker1TmiH)_92B%%l&mJJ zQjxEB?%y1#D-u3wi5{g*J5f3H8*W_TZ`8{*d$hdWst!pQEbF~Qo*;RSdEn>ge~=~+ zoeat6RF8LaFXSv@H2m*e$~iS2X)xko{J;#wzDe!gnH(wdv2Yth;nKh9L;X-gFdgJf zFvtgtSDK>onHt(l3{^mT;95Kchg5kRqF!#zTA(LLF4X$pQeST zY@snjA#K?8j>6u@Q?ePQf)w6I!tKc(lVqHuqGn&KKORS(45}f3P$4K#natPnKgcR2 zgm-96<7tL8VIsx)@1w-z1L7oShMavm$j73wwYbvQk{f}uiMKCV+c>3iIuA)hWzGO! 
zat3skYXMMCJ+m?6AzW4As+hFvqBtJqz>Lwnq37Tu+V9!vQ~%W0Lg;c!xx|Q+ryAG^4?B$43~zZJIC&%%0_`(JuE|7B4S(-L_aJIQ+EN|<2)dZZHKJ_eS>;(=LV z-#q@ZNQP!QEL?K%hi*vyj2wwCSN0~PBCJ`+05(L7^Ix=ludu?S?Baoti7d=M zE!8umpsapP_5xz~%oS%wY>g~Fp=!*=I~=9l_5LOomUbnTCd{ei-!RC7Y2Y=7sFb_O z8RV@H$|1+gL>=w|4Pi+VbXZr(Mc0qGZKkZ zox-qFSfU(nV&Q6%A3i3-r%I97AiOtdI&*RC@F`8JtO4;p@{2e^={A)=vaF(dcN1sT z36S;^U-N#L)ma4i856(wx$~*!rm$7QI_uD0rn_OAB@j~=?0zrjm!#=0M&&4ubQp@j zIH(bFtJ;?xGvg|@ekFxP1U{i5MB8>%n0j-TbAQb+NeR5_e_$|xlt67dog7V-E)o1e z`Rj^43ZUcd-f0Lg02zZta{sDT?Fc82uPX4f+1o-#_j|nqSxiv2hp`vW-S8JzF;;Wo z%3{E608-?5KxT2ZsCZ2j8CGl=HwoRH6Jq5GLA+z>YK5n46TA;xM?P#922FG2k9XqA zdZ19oBvV*6qQLGltfT$|xg~)!`J{Me*iau7<>=z@B7I#3Q~SE{a$`GQeQP(QFCMIO zH%Oz0r{nMVo0JY!F+JrB{+>VNQ0skx!#h>a@$H}lXW^}|A>42I#~1%KXDPNPok*kp z#T&Am1WV&&JpI3NG-gU4|1pbXGOFbrLt6D&J}{Q#bT~Z>xHTP&iwvq z>ZnkXNMW8eL@NvdY*Q(XgIHPAuMe3LkXdl7$lRIpoRMKJuykhqP#2zs)~Oi6;Xr(s z1D*;!hFQ-RrcyipJO!E!Sx%TmB0ZK)wpG;1h1-yKU=Z<(YnYm=5g9zdwn*P5iI!n1 z=Kubw)6x0goxdv_8Uq4CxSdH~6Qr?h&$UwNPM=I;YxZd;Y7EZaj8vUG!kjTwGzD@d zXgH)SZ`VH8MX)mmiD1u=ohSAit5ev3`KCM4k3et3lTF;08RI`ecav6ZQdna}PDvT_uA8`(XBR zE_ZbMcTKkSc4&10Sj&O1wB#%|sB&~>B{faUl$XqtPY@bw>W7M!Ca1tRvz_5+JPiOQ zkz7Tj`}s#+uh#c}Gd3LOWJg2r03KsFHNRd=D5w0xutdu;9a1(QD`;nKHW5O&*H|Op zwNn?vUX9G|H(}R$ZEkPfZF~hvEpaQMR?c{$c0`e_;rqDo**{B01z%K^IMeg%kz;Gh#1WPXxEn+Je zrhrp_--Keh0?a7Sykal75d!9j+7k=35@9CyNggLh#jbZ8C9?Lt2b zN9@Zgc1KgixgFyLkl}GO2bR-JlvUT;MH9lUyZdHS_j%IJ=*e|vND1Fu)$sI4ec|9j$p`uJ!_s}Q9^kL`%+38mvJ|k<3 zUC{78vb|>A3zdVaE^Pf7otCAG%q@s1g~1lx0VeapM<45OD@NR8{Ex9RC^<&cDmsQEfsIS%FCS|QMvz-%Cd5(uV)w-zz>}Sk1Zm^t1N-N# zq=b2DzE8OwAsJs=0|TZ!&i)75L+L4ZM8Op5Kp)fF#pIkp8|T3 zH0|Pa!f1{Igh&(3L7qK7t*mIoWZ3Vd&T}{Wtnpk#Q1#s<--w$aQ{(!dy&q+4W^WzG zD$v?i9oAVyL)G+2o}dS4jRbcQ@7F3EiWc7bq3FwqXGupk^^7!tmK-ONebFVC(5Jpc zjkq6tEl8*6o5V@@aZ!vJ*X57w3JVGfhKU}KwSrWHP4*3&(=Z32CSXB7Qd5I=lVj1C zib)d#+Rx7a;7>2*rnoO#rkJDRY`1$_iSPWsvoJUmiYcR-h4$SalhNN7c=mqq(>BW{AD#cqR3^E146;LZ zKomizH3XVCnT*UJJ4!R;z*5TK43%6MS)>!ov0elMAt?fv2x45v-5?{m40{*R(xA#d zrS?==+08p!PC?vK7#jRFOyIuA>*)9*w=tE5LKKK?(O}(fBpuP~`jsb~H2547LI#p@ zvBitqwPe5M#pNn8^;xgb+o5m=4vw+^6nG%5)H!vf}b-&=0PI77U6s|uv@(LuCzq5v`M^TByGRQ)x;u4v}lBemn(U4-*R zQwcBBp(@k!iDhLDV-}TcX9zWQ)$l>G;Oo`yX@<(OF2@<|+MoSPpj~~0CXy>Fgv z)nT$T-RV>$4*DFOp}oo46VQgFp@GPt^Yi9mAS)!|4%qpqJa;$5D7v*0DOzOE%oXNY zQSET!GK0S#`y=&~SyrZwGW-7Um=v|e0$I8^r-g>4^Y$OM(Lqw&G;<~ufzWM6;u|u* z?=OBNi(qLrIh($Svo}nM_Fv4c&e;kQYxqD0#VAX>v!MX}+c{F*e*v!}uH2a!VAi(q z2TMC)-b!DaD_4BL3?Lz{e89V@P)&8zkJC4MU!#=I|7D)eLk78$3ZM8mmTJqz9^>eZ z0uTk6r+{s&FM*dL4W_qbMnb&#*>A|%u2t^_FsnceQVt|+m0xcl4`jxP9|7a=flPOE zift~3B(&?Tk&?xj-^&+*g&}Pml6a^#m+9Gk4ZrlHbhlDlaA0Asy-fdEkbjeixVVo` z_SD|F49D}PuYTJ}51U3eh>{__@QP1&Tx+nzB6o%=Qy;+MEOh|!DMfPHBTJts^PXq5 z>P^m6hzEfivJ-u7XdE6kzmeJiQz9JQ7P@>5Sy3;71N;uY52OBmq!pt=4^-(Bx-$yV zXsg+Bs-XW(%iN>fV8IfmJH7G3avgdLzlc=jzz z-=+WfWte$PgO_bGi$f7C*zWD3dH59AnZ|cC4dFOB#SE3<%XLsa`b&S^AXw+0_!9~W z1&gI2lFj5KyjC+MU=Oea-1xE-jj5^R;i0~MRaV?W?=@7xd^}}^If}XM+5-InQs zOPLgiIE3v%++K~QpWv2V_ChS1)pQVn_&5IjUwvU8e)Vae?arY0K!#}^E|XWhMPmuE ztmdgY(!Hm=-TpY|qae$%FKNloG1{C4XcVfAOFF5NS$dOV_zOkhgq+ zW-5yf;0Rl|`8bP?+6ho^Hv%O?B&)@BkbR!=`A}wesN+(!fLAuEOHb zc+gjq6+6j2=%*36#MC}t87Z zoNM~pC?vvLMH2EILzon%E=$^v2I`B0M`y3rWYnzYzc~r_dyhU9${{EE#2okGs-p8h ze)s>CNm*8)Fl1R!zCS}IvQ)WIH-#?|&N5sd;UlzPNR#+W-)o9j2_pcNWG)n^5zF5c zQVLRQkVtfbhwhHLW6x7NN zSOT&|V>y4r#F>0hWLmUoj#F9dOXi=`;8X?LHi_+VJr2vv^X9)1)q^qL=>&TQgQF(H z9c0QjRdS{4xtEGv@Kxq{CPRTmqo2;*ei_|KexSDHRO$3Y^>a9*aTk zlCq*GG7BAok&`W&5M#Heo4LpM*mTWW$ zC~irGbE%S7UU|`HWKorI+p}Ib$-~o!$xGGe(Q6I-HAq)GOGTN7~I^FVvnpgR@#4)U% zku8g#k|yzZYecgx&G|<~%r1UR!(b^JF=T+uwaRz;z5W=Ntv_o`+vGfvky)_5idQ%b 
zy1UxcLaoMEcE+ljg+Gy-Kgba^PxH%hGC0T?1Hv*G@~`bHNk+X$Qfj-~5A5;a^rWss zncu$eUyS-04$`OAPZ$nwk_f?9aI2Y$k;@bbD9O??JrP`La=N^6NWnoJIzsA^m3Q$~ zn7Njb&0sVyN5+n|VO9oOS<7ta5w#qm>?Hp(m zJ5*?YXV6#EUdlJ1AR2V*(7;}LS~|l=QXs~#Z^eu4a4wF=K|G05=x|B4Gzx13$L-^S zOyNhC(=}-2i~Q$XN!j%{H!nRI1t{5_c1`UqVJ}s>n0;A!p*09&D*(tM#T3~^Ecjnm z-B^?CwHa18&59Hya~}N6sfYF#sE+5|F{iRPQ40O`NqqUs*I$vo3?RW4%h;){1D7x8 z#!w&0U+rCxnuU`f&2aq<@Yy;mL47HJ1UPQ6a$xC*>%xG6C;jduLJrAS7)CcHWH=FT zjh{E3^{)9bF0z#_8q7|{-!cI;XyUo6{^1@?)J$2LQsgW7gus1 z(hXqZ%`uL(oP81YSRjz7nvgBehS9A-i3L$ngk;IvvHA-|1_Eqp8Y+BQZb2pPEz6H> z?sQ)um_q?JS|@j+5C=Fsm4d%H)zko&GUxAnH8vZ8MP-Utzu@EK$I_?-RfU___GHq_ z;k0PC2P!Avbqa(SjTm~&Wx!|SbO9985Nw2Oi{_J!om2;pK2~4TpDBfpax$+^w1(ub_z940*T7^W)wIeqk!5RTTpBjHnQ0pnpnb)93qGl z5gQN~VNgFKl@P|cG9Mi=4*v*1bu6Mqm(Sh^70VV!r?*a8<)K~%$*F+A>$}H=4Cqw& zIC9lbu=M;?mgEpv%}7NQlYi{CkVPiSV;QC0WoCjFBMr;>W&o_qSHJoBjdohbcg&bQ8x} z)@cAQgyO)fjb)h_x0Jadd>TC7!0{(GtxL;3>6l~{?^tSzen)S=E*X2|4Xdt#@DWyy zMa*f^s{J%@awm8Lo}suZ7{&UgOs-u=Su=7r_-d}euYEbNy%tKsC5W1XBy%OMkxkS) z8uBJwO@@8R&%UP1~D?8VpSgrJz@vhF~5IDcWvsJ zHeZU~Ko1$Q$b2CKMuZ|L$vdT!PCcIee?B~xR2?#_V{o5BoQE3KZ*+tcKFp`9BtseCxfjP!1X=d6Vs)k%^G5rb}sXp$+O> z;FWelLizF~vVGMUL$e&Kgb8MMK#6%@#C`a>G~4PAIMUK{3X{p2cyCBP{rH%>GX9O= z(v`c&3^It%uM(M%+^9Z0fI>MFF)#eMNZF8pvIt~8FDl{shp85*46H!`A#%g`t;V!( zB357)ww!=IyZ&%bFJkWQ?^2-oV_9xnXzoi?1N8voJCH^~=o8WZ#?^UiMs^x?q?CWL~<%GyAERDjvo}TO_ z>8IHQ85pU%;h#a~kmEzs0hEK_d${;1XiR(XV;NY3*xK3K-bV(9`W730kuH78_~U19`^sPZFq@c| z#+@ShHaQf$4W4aJN1EGy^*AfC@JNJ{Wtcq=n4D_rq=UlAyF=pv*c6JdDf@Bh8u z9~=M~VG*;*%og`T5AW=muIF~*M&yw zvdvg8U#sL+mT%@X`CvIVPNZBuB#iT447GiB;y#zE+ihB?mg5wC(8@Hbv$Jr(RAlov zw3oMLj=~~k+0rOPDzlsWdl=yf4-wL(P@Sle3@Q~IIYipZ!Qzz@NFSDUDi;&wL zq%%1hjZNYAZ$zOZDcwS6h(cu5TPxRF8DZ=5b0y zEi=JjOjx&f%nRnD^5)=xh)lx1bJST*0+$SQKE6#3ijxUrEiybTVDs^3bhLFxhCx37 ze*JL$#S?ybwtLU}U?6a;{xVY1E(P_QQkHKNQlVt)Ujt!Bg&R9IND6XqSsGkC#o^6f zyQ9GMq(s%bz6yq#owOo#kWc*D_3Jl$a%gvj8G!5OuxEk6KC9;9ZJ9>}Jj<6NUiGVl zg~GC6E&};tff>p8VJjx?{=t{;D`7Mw7f^6Y;$4!xN?HbutzHH4c<4fr&6Tqgaz=Zk z^(;xj#!8Gs*ZbDDeD)S(6lO?Dc3@WWd0&0_r0-^nN}*Mxe<$iM!~shPtx~r6j;-34 ze~E-62!rZK=olprc*=A zwQ`R!bMZwp=Srs86xb2AGa@-GGEwssb0-c76N|H|m#%%`x^!7OdnRqqBQxa>V=8tJmC1l^utO*)jFy)A@8I7cwoef6Cz zQx$!;uPEat(nvSpZg-{={E;nCYgNFxE(pJrwzVh;(`gjofbmIWia8`icRu?^pXE9q%zo$5&wPyh0_vOV#`4=LM4V&_X6{(I zWXwRwFpd&#=iDu|5gFYOw?nt$iIpb?C!CAAm@ClM)4$#_3|Xj6vp2)#*n&VO6m(Qc zYIpWFn1~!kpEMeh0L0#5jkhKD6H$VJmZrs78o?T5UUOdx{rSik{eG>vu_zDJ`IF50 zOLh*tzpC&>Ey8Mj3vj7KBrXjv6q)RkHxST(_lzA7YncB^=73Lx8jcn3cUT8Bl6Jpx z%?X7cw1MR)*7oYN&k6Zn{FMcL+D-`F15FLZiw3Y7sX6(?VA4hOn3=4Vx3U)afmP(# zp8^e%9u^4kmY=r4K2JJa&aw;HWOHid!w1fcQ%-af=93#zWCsh>#-4tyM z?x|#mwh=*W`+DlPs-xokTmtZ9(T|9AD%JrLf_ILCW;X=I=dL^>w0Zt1X9T~?x>rxT)Atzu`C!?GhS$eha`F<>x2VohjMQ{VCK7@4KiIwI@q)7Jn*e0@K`F2pCMAn&oo~Pp#I>3W< zAO|1DgJd0xwT;MI=yyoG>hnulMWInnkV={VAR+>4+#}$xblxk|xP>uLtH_;9o1_*I zxy%d+IW_3!vhNBjeWH^~xkqh5`~@!JQ8=j%@al!>mMG1jrJ9@=#f6PGjQAGv0{L!$ zqlgHgep|2X7QQ|qB#u4naj0*=TK=od1@|=yy>HHQ*RBg09rpGMhb~)bK~nxZ1xjHb>R#Jio=?*Q&6_V;L#%VHEgQ1&Zuie2yhVhD2;@*q#Xa z!FS@bHuRQ|#5Py5f|yCu0dL6_HPIJ#bxRNqagI2zxCKvbC2d!l#N=7PC6+Pi=Gadt z8b`Cdjx%o*er+bc1s5)2n6`RAaI*K8Yx-0FJQ)BVB>S&!NUzPW)7B@D(4X|-^BObD}*@dA%9 zFpK=<5>4x85DTenETLukF7XK5u8Ng!96(^fow*27j@$DOg*D78jbhT0c9)<_UsZ-| zLA84E0uok6(m`$r9rN1tm!I^(j_R;ZfBsqk9BFVpU>Rhpg@XK)!?o~siB*bs_j`L< z&BzGS)5QxHgc99dD;(^y`a;}}T*Jj)j0NS&m9EU?zSK(Q*67EXZfq^d_S8qf4JYAT zy-$)(Sx3Cz3jf4ESV(}t)SMI8F(?2TEQ~iqV$ZQ?f;Z7}H?T6u23c~((3z}epMT2w z9g{c0djZJGVExwuN3p<&Y#X}UpITt$c^nX;`vP=OALxCCS5?{H^W+4S-n(+gw%{&) z;HpQj#(>8Mvp1Xq7;wtooNpQ#@II)6g1kvdIXEi7NtQZRh`Y<-Lx(FeMxA671HGAk 
zt949ChKAK_Pl)JJxB1i;26H+RlFzX|9OEI@(?RY}g%Tx$S+?Hc-9k6>6GEzQZabL|JU=xa{f>ld0Peh(~$ym3W{sVhUwwCdPI^c#3_IWHfw`4t@ zU%e+sZPh}AbGQSciiR!oeW0xs%9Y519ZvTjO8{(=7V)z_#j7t}e@1$%rXk)F?V3^N z%fYPZgQU1V)=3>MlHEEi!racGBVT{w2^qgR|6nrSJ1C%^2$ij2>Yn0)k_n{CKwbg) zLW1r>_anwPAkEc0!DlQ@!bF$0hj|?7wK`>+lE06+dh7S4V zYff4VT3?>Av~sLv11TWnepp~HBI{0J58i-nZ?yYB@52Sow7m80Cw&tse4{`$sA7UxCaMGhQ*WD=Uer7z1ZPm7lTx@i1ps~Hclb}p zWZMMsxE3zful}mWgcS%~%7`P!g0(O-DBTPwAT3K#;el2~!ZdV_ zB=@Skgo&xFX<#KaQWeOYaftwWpC9&rl~hs^SmY&H50X`Uc=4)|pQVya*SVj%gzJ<& zB6qEV3sLBClZ4`gMS}}Jj9v3w9@z+3AE7G)4Z)6?8N91tV0njaPhz#gT{ad{;F&zq=MY`(}s2TSd^yen zD`laGgD6mmmhr+YRwP02QS!~b!p4s?6sWYGsO?c8(5ON9m<)`eD>(j2uCsNESXtj; z1Ux-vFn+V$Am{qptkfr_#0#|~(2OrMaXF)K8!=w1?J~&QXu)MlFPC+4ym3CT;3IF0 zlQU9#V9PRHFbC=UHYxgJPB*^&ZD|Q_?|0;9V>y&$2?J*nyR)qM^9j1^joKuiQ_smw4tP%Ic7_Zgoj0gw6Y-;8S2cP< zz@YGWOQTpCh7jJOCB!B3fko|1Xr#b0SzNS*!~1HRKv=riAQK-JinbMg4{eamcs7z% zj4|*Bf$(J5i#$;P&O^5ZhpY}DqIp+f z)?)fHMW6*lC1~H-JEq7+3Dk$r`R>!}Eh007ype~+o0~R0 zU3{HBNSI`T zf?=SjiAC}|U=OLblB_&+u{B(peBL?#IxV1x6~WRL4wR8wyl}e?L3MkN_$A+e9s#qg z08aS0`3^2Dwk{(MQZ~g!bbuc~4tDRsrJusrti~93EH5-I0F;>QP%O-uSV;c;;Ys27 zH`lEX2kH0!XYAdYBfG9U&-ns=ndq2#m?zRyv&trm@6^RkB(7DsH4;@NiA9qKi31=D zi9AqVfCAX;kVw%Uw{9(4L0gt*tg`8r${mWP#EU836yan=%+mz$*7NF5GQYJGYwiE8 zo|zc799@abv(MgZuf6uV{MTBOT+$S;vCsUKK?q zI93iVjvlM%J03==ihWns3f!+xxvzvySJjWBt@RF-1Oh)rELBqoTXy$CRbhP8bj)4( z1GW+#dS5ORpb1+;^F84TWQZDQ00XU+o1g;Bt4{K*nVv-@NG`5-vj<>>2xu-cGV=+B z)1h~o#H(D=n4PYS&vG-LF=RHnAQe96ot#jLEB zw2#TGR-02HJ>D!m!fnDnK^C%_3gr7f%%2g3INf`dNSMZ23xJ0xHF%P*LuR_r7K$CA@)BAoe8il0vtOf}Eziv2o%mu3hzNk>DA#iS#ap2a zW2wi6870CWOWLF##}{Ar?WN{Y$Z%e(K+&T91&IZKA)V6<{1w+RH%-1DDT$Lg%>+5h zv_F-qsK`E1AuvYl?SmK)#sTS_k|2DZ|Hse1c+)8kNyxhrRqWtwH(P$erkE@{^>T_dWKLK_aiRfm%$e?l6`y0nZWu3K1nAg zC;P3Aa5UjN?8VUW$4_X2sd!-_bXuv8tg_-5Zc>xlSoRT(8wj`3we@6Yx$m zw3^^fKz_@!bSD#Up=sub=h(rS4E5FB&=hr&9f?>gq;}0BjJSowMLR2X50apN=BvZU zLS{Ap_o+}3-IU$-i88Z6b)c@^xY7FIrcYglo=Et)1E|smsZ+PxaH)8FFYsp+zjD5_ zE21#h=d915^MA4m!0h@0m>uN~&RDxxmDtkO59{5_zq?q~QMaBcFd)%qaLH4jbSbJkF7CQgoQ;ZbkHUq6UsHk=K}-6GbQ zP#Yh~N~x3F9OtZe-nQLf=w*NX^u68QzT%oJ+b2XKiCr>p+;^w4>7DKYI<(KL6MFoJ zGA%`E6HWATfxXQ}xh8o^U(w!QY>V6!b*3=#H z(Z{)fZBm1ObX|_EQ39@U=W_HnMMXl~Y@= z=Y!65rwFE=CM<1#EBfyOp2U=Eznw$rtYxluG5m_9T|s%(9g zj4P-0pS<>b_&FH7Q{VyX!9EjQ;UU9h06Tzet`{*=}t+-P#AFf)^WXmwX|vkKjh( z-IPn<8$xI-Hs{&vU;aJ+`<|$&%w^mVHuEvsCRCf^P+5I#1FicJ9!`!$qNWmiAEH#@ zVeuFA#$UYY0}?dmip))@FaGA@T{#bUXw8t0d(&Jo;UlBp!%$V6i-#D+=Ig+p{FUV zmO}+_0EJnxV7yUIz2(z(rHqTl)5HFh0q)W@>JSH;&p&~OfF(L2qdz%}Zx!T8YgZn&LXBVz3vGFY>FNHp13 z!dyGO`k=UM=_gh=_(x5|0%r+g@-VKn^Vke9ce-~oymUUV&7ch?VXO!(+N{!YtLSC1 zRVnm#j%aqSILZzFiLdzj_3K~ue|M?M?2{t+k+}3d6NI+f-Q%Pb&n#zMz)BP-JvkMU(5f8VsvEPb)?Xyeow%LtTJ$$J2i* z^ajEUy|~jYVk)P82SrpbLKG|DT0R%lUh`qb>Z~$ugdv&vLPyTw7S{wyf}HS$rIiTp zO+Q5|@o{;yJ)8uT2a$;K?>oYmPyexy&?=G_q85_K*Gs9dxKELX=bG>V^T~7_ndCOP zD+3F622|+jKXlfy4ofnLqN&AVaosOV($s_~iI315Pv7x>Q&l*;SQ61FU`W#I$jd$Ni})`0%hNCbJ#qS!NAyQezvpxSrq3vcTXC=D0_ zOE?m)H57unRc~)=U=ok5ak=0Ez&>Ry0K9#8!=VyY_+{ihtL-ezwA+L$n6}M$oBTVZ zE~q1UlD?Q3uW8gL$uYbuo10Fk0T!uFeKW`8c&Klc-2A5_ONAHX)k_9Ou#_7{agR$| zem%VWv{)o51ta@RdI#B0fmQ22yn?H83H$c;>8kAi1|$?=E2eqxrF(Lefg6PO#+RId z&@(VojZ@<-wnW6JxUf(HX0BeJ2tOH|ncGv%tz9+PL5x~go45jbVmWz!6_q^;YlN7@ zc1n7z@Fbl(lhe-0z1IM#bP9GOr1ShU&a~;WF@)P^_-4MBtL-B60*l35PR|0A$tMIV zI1|jj|K)#}{m0+bBw?^XwfRZo8N0KKlemKmKqk-SXK6Iwc6Mac%|?1f<`hDkKqa_s zp1i{8nN4*0*TTXUPLGR%Hcg>WVNn%r(rrr(H|B4rv*@^|{qMFDUPOl8v^+Zhx%Db4 zy1d>Y=S>(2SDKzIq<@hC!pbQrC%-2$8L*A&hzQY46biWrWW#2KtB+BwcPrsjCp$Dm zrv-%VT;CoK)%Sr2w@eAw4yv3Lf?WLkLZdc&#L}~ugn>uj68>JW4!`+Y<$UaJ5;>H_vLw2uR5GNRUmjB6f1Ux(H|$S01r?g&Qr3=Bsk=n 
zPdngdxfooAkhav1s)P4l_g7y*A9c3$UI(!)n?oSj?E1K1jA;&aYg>ae%GSyHEL=ig zI8g(xY6~Eb{ird@R%^{a%_>v+toW|7fU-=2kg?@#scV*EA?8tW!FXrI*J}M#b*E{P z5T_P#OAF;9U*TqiZFnUnt~LzCz99D`AM%u#W+gTJU1&kMIhd8>%mp4u?l#PIT}v?(IpKf?0+l+j9>;~e%b^LPxS>Iv@Q7_3ucfpWOLY4I8MdghEx3A zbiycfD35hOapbZxxc(73P8yt=84AUc)+eZ#F8SDapZFnp5K8(wVl$_``?8`Xf^p8e z=&IOq<{k6D@B(7Ylz5*gRuf(HnKuA3ssWS`< zR*ASrKvdmyTMJ7vr;}9gNzYRIpy$eInseG(3h#ud=;vfavz0D?RES5IXn{Q>E-idO1Gjh5*g`*o1bm~=w!kOtxnt3CLUZceZe&HC-QHsI%pJo!ge-L+WWqp zob^fe~} z1>T*e{C)a2@}bsRqR&42WmzIZvwJ9F(%eajhNxa5CMg+#h2tDxV7ej?VwpscSTvg* z;s`*L7RK72oT>F~F=EJN!Y2ew0=R?-WF@V>x)AA{CRMfpt)Ry{1tvqy<>dOcMr@2N zXPbg^J3Z>lIume*=*`Fp&4A|NWMA7;V)+8G1ez*@{mN555bf7OwP(_UNhk}ieMvZf zvXdnQ3BlZhPSd3A>CI#%*NSiqN^Swt!y-`pr`eRbm?Z&Tc`dOdt?9XubvNdU{v6iu zHua-@MtrrNi*kCHLP(w z(!$@6eG2r%W)de61*}s2(&u7o_Zlb#tRdhPk z9*p{${Gi+G%Fjpj1K_nj7(Nj_(T>>CV{UK38RrYuXRGYupZxuikdZYaln0sW%73~ejDo~M?$TM zBg|#>jxDjg>cpu{GNW?~XWak z5s|NNUIr5KoC{49pH&JZO@A@{pa^(kV)y%v^9r}jRRs>AmcisthLpFS*MJQ>c8slMTKfv+ZmoEB?|>$w=!ji;}F+2_X(xu~fF5pkbgoHa(O zgfEC$3=T_Fgxw#q0;)U?!h8otB=gv!kgptG(t&C!Zsk@i)IA6u9a-~bc@~eQy7Ld} z9Z)7t0H(gu_I={RT&2!y>vA)R8)N#g8}1i6(rYvh=apulQ7c$>I9*&MR~BW=F>lRt zo@FjC?-Zxl3YQo1>Gt-bENh-fx>I)gtL)_7ftCI5y9Dt>E>LKcBOw$wtvz;N<}$6$ zguU-do`FB+xpaPzc5|vUQF0CLP&SK5)j$>4HL4IXKaF=0lt5edEwFUFafrnBYTK7W z!IrRH&PL%%qX=0XGbp&1^^ru+2R<&JZWs23Ig(HoKpqH9w+S!#y1J8)2>7=r<8$Hm zt8648El@WaH~HJrjTr^VOs(QNdfR!9R$9ZbQ|y|^J~5e#b$papa>JRC;Ird8kn?~O z(U63(m8Q;J{&fydvU=`2&4HRho_t1+e@2xNpFBU_ne6$CCK>fjI+3N5$)v1!4K)&U z*556P$q{0 z=L4slMOka~fvYoSRcg;)RvwpKOP#a^#sWz)lBzr=;R71F{s*1El zS@nYioe6180+6v{=vrzq$@gpZZZ18!22>Qgyyy)232nUJ&xQN3@`zUXvX@G_T>e>d zT>kpdCU#!x$f&5!z(e7QU?^Pphz z#9k6)!Uok%U6SlGp52a<$_H+_6t2dj;BT8gqUX`E;z3Mrq85by3BNQ^h)S!I;657z zT^8mCq}#OlI!0AlEF=ga4^Quz$tdx-Ll3y*A71DGDdR-a7bF)XmsdqDAPeO^r9K;9Ee@86+Uesf#J|0yIbCDm;{W6 zh(_Mol%rs$9vGzWKC?+1!V)?doZu${=HnrT_ct_(hVAq_p~s`4i?JnUua9kYhiV z-@^}9rX_fSnt}*Bf2G-v7a#g3#+qN~$itmxZ%nKuW{B(-49~2RTE56wN)`lG@z&>f zB}9v0z&=k;LcV}euwa8zmP!SqxR)?D)NQ;ovBF_KG*hWt_uThm8-2-H-iPhLUE1|^wA#hheJvUAIskHr#M zl@}P;u!^=inAyK4qjpM7%wnj$-EdMf@M6<1^1H}MFVd(^qnrrGht9y4-|+)vRdD>@ zXbEvyFsP}B{#EZNr(CCs@4t1@B`5Ola>I-JlUOl%1UGKT8c7}n?<9Y)(C;`L9!4Mw zl;>Cg*C8wjs6NhQXJPBk!j?$H)Hp7S*1GIm1VSm&qZxjH(&0P6M{*y1GG>9Mf-v%R zS*iwMsh~1kD6+Bt@;fIKC2S&(npf&R?$l)n%+8{# z!cuRj<_Q6o++LHq38AC%=o+4HSBEMSy&hivZ6{Apa?*?e8@No=lpSD%sv9VMqve@i(S!$f$?Y56`C`7A~PK z^_Juprh)dR?}gPa-6j2o1SKUWeN<0>EhHnyi4b?#|7S-5>bImknf6JIvLK|no2dbv zNte7Y3jv;jrYUbV4bA7X-_z~^oX~zx4L!rsCOppN_i`By*rvGWp|oC07DSS{R@Rx+ zU$?ljGh@jMfu1O24=yjkIn_nPT1*a zY9U_s2a&BuH`y^~NZw(YUpTzFg*OTd5#d7il`AgK{-G}|c~%sZWT|0vj5?DsEdvnE z<2?3`a@t{kGv~W@hd7~0b<2|Qnd>+FD3+z;+gw&dRR=b4>8TJ&g&t%t>A0@$8f(c3 z1BQ@dc?s|0PncnXX!BJv$@xhD?8l9WH*@@G7Ds#I_WHKSlR07_8Zx$Q5}rwJ1OHjd z@x_s-2hy@8cNKSUyB`KM6G9~9aL!!4hfd|Q=UeLv@7zT(!MKX1A9AhB-q?N*G1^o! 
zrGhXuI3<^h}Q)6+sK3 zOfp=^v6S8|dwo2iAyVMTY&H|gvZr!PjxQuV9#4QgZK-sHBc__{TbM{_gwBzmpuxX* zANBj)dh(z*{bg=uG?Qb6a66(#4Ej`v#+mEfnZ9#Ok5K0Zb5oiX>C6Yr3Wtr8^ZGr_ z)SWN7vE~Yj{w&l0J}x&=8r(#nYgv+Yb>FNgu0pz;9KJRRGT=gQlPnpP~ z8r)G?cIx+^dO>)T(!AkqAPF>>9RKM@Ed5=PgbM?!x=2w-y5{(MQ9453WAe{F?O&Xu z-HMPMO)xQW-^!aWe{k`!A2LWhlWVy>1^vmL%&V<$xUgFcwPqqcBe9C@g!&HJL^K7Z zwW}>9jNTU?$V8}h5Eda9`RBd_<9naNPA-HdNg?!|>DA$BV$E+R=$2O%P8 zi>oc!rKR31OTu{+F*rQyP|QW^uQA~l?)mxPLx<}+G9~t-Z6n2Wsx$F05hF-QPi89* zQ}886kQr>b3258D{N_112$nI>JCV6^GLF6sQlku?S_x#?9+TalqE*o9+7b z*MuGR!LrJWD{AByiuc9OF5h-~KmMtfjkfA490(vmvO=!#S`jQai=IeG?-KzX;bjR| z#m)plr=rdJGJ*=qDCdfsK<^KfID{ovrJt5-FE}rs@t-r+ufe&%05R*g*K85dHZ zn2lr$XDLVK$cUp!W~&J2S;=66D>6p6gaUDy7YB7q`Oj8}pI!~|?St$+TDYnz z)PrJ0J6)cz+g1mfi^^Hq{z4wKC0mx!f3R_~+bulhMH_BP^3@vtIUwrjT-2ufP%f;* z8|I;Xqh&Bq1Qah|ak5;cjRy|U{OeE*569E@e3STceE57$I^pyG&;JSU(PgSgEhNu) z7?7Fg{`K2GmCc>m2Mu-oLub8TYn@X&=CZG?o2{=4;aC{uOXS^FyCZmclv?9D-B`$- zp1LY$)stj2I-y}0${(n4%3*fCzW1_B#E|MvHT1iqxXv1CRml)@5WkxE5ZwpPw@B)g zh;H=8SXR>eCKvm zJXB0TCO2;Y1m#j-vnR5t)XOQl^{mmNkLCGH$kY#r5^I?Q1B$sQks$tv>t9X{Y6rRp zS<|`B*VBLWHGgBWEP$XnMJ%BQiu>1$c5L1DWC2!|puE%SF$S7XaQbK*j!2$PK8}AD zPB<)GJ=)lp%p$v*{G?b7P8IdIfmFj&ln4WCq>c zhYETKvr+envV;&ee-+WTmgqKwhq7~`fl?Xq(7+XST99|iexupni5dt_uGxe(brK1 zWrxVF@sThkiX4gc`jmd@^4sx_9_&t2$VV)fXnkR3Fx(+??o)H5ITj*aJWL{u25|lY zq#wvKXF=w@(WeA;uPYBxA!1A~6u^Hn^cD>;`l>G=NpouR`PoP#@>lusS?&CnPp&)J z4Mps)hlmYIczJc6VELVE;^23%Htu)PM-Qfs-zfd*P2AT@2s>Jd@c-S zJps4@owWP~e~?iB;ywIvzcvS_NNIX`B`1(f5ja4Rls=b=;D)xD##I}vhrcIqbbO1x zSycV9C@IbfXDRNV*Zgg`dy;$5kK1HS9T)#VzDNcE;Atc1psS@c8`4gevWIqL-3FWh z>jtaPm=bWlerz5Sy~;2Nj>v9L^KOe&Ws(_I_qLF99!-+|T*R!!lVjkxtAzzf4p9!| zK@JMMQ*Ok_k_uRmjJ|0F5dn{9o0Lg!huX}CJ}x{wg3f*n;EU(W){#b{R#rN`1B)A5 zYyJrJll&2MQ=;KA$5hfqn>WmwR-EV1!xulG`iAcpuWSPHYbro-QUX3h z2Kn*{!);7;hGC0Nb@ph3 zC(U`rs+#&+7Qt^oB-w!@JX@1!GIUa)BiG3?ECM&f`hKH_%R&TJwSWx!0*XQRZ0>Y9 z59kXPa(q9lgwuD9)cXUrIG7pD;^I?Z>Ydi=?FCsBSLg0p?*tPRiC zX}p0VRt5dCpA+bVGXW2^+ddCI99L&ggg+>s+dn%6xGlfBGYcFFv(q;P|A~+pH=}lr zGRKtxHB~Ryb|WXno*18U3pz^y?~Mx(j9H`nfAJ4shAey+eG}34_w^jkkc7gs=zVO9 zA+G%e0D08|k#us^q-SVD86wqMBS@2*TQi_~nK#(02w7OQIbn~zBfZdQ%8)ST-Ehir z>BU)GOSlw&*t}NR#l?1wa@fr9ix|Jo4IbI^&-}RF!n6H} zaf7q(!Z0{_`k^9;`qqCzM()6+{3Yb;Q(yHJfbK>yaJrLwqrMWlIQF^{KQLDZr*&p? 
z3U68aGFMQZSENL$FJfRk`SEdGsk_yR`R3lo^!kh#4t~^L^mKw;%+&h&36oQXhE+K= zuD|}A5O)tRuh!bYuZ(LaIPAau;{W=t@J}k{kunlwhz>WRRwBCt7E=~tW#IhS4kxHZ zav;;f3-khaa)b~~tGayCpGAY&i;{(gqsUji1#HY)EhDeutio!Rd8{rhZu?k}>GasQ z;)U0QR={|uuHavN?cd5ex%(df*gxarHK=pkxyIDyqjna-7pB0AmigFo-4A|k!JvFK z@CVo>3x6a>oF%@)E|Rsi0oIZ3<7-1BuxPOe`qZ#h9Y|PmVRiI-d!ZAN;g?b3)H!h; znH)y+(0VY-(1cmZL&8WCen&uBG#VEm2hRzxy09K7Bm~3bxW7}rp}%N!7Wx{?Tmo^W z2KlXN(cT5bLUt?j^VP|YQ-vdklgw&Y`-$zHpM(B!8u4pfict=okNK|S!?QPD@EKtg z%F>cJ^7dac*b?f6d_=b?wyV}REzS9y4uL}Nt1?-o+)jV>-IaOa#F;hf9D7U@H+GyE z<8}-ue`noCXsvC^tmc}b-Uds=D0B8vGI&${C1+_ywHqYLQj#;}n`xG;F9Jv>;2VF4 znv2Ke>xFBLK^of8lqvL&e=0(DmI_*wx*y@Ck(J|?I;b?WHvu4$dFhkjHOEB?TW>%F zY34Omu;5)>C&)9+7ec*72*q)*;Jwi7W>e`@%%@qDielqcxZR>2jc*ar?p&Ic=v_Aa zneDBba+6NZ$)MTT?UdR z`?cRwcPa-K9oxi)to)pylzcP^E^I+IB9cb_FspszGh+qeoY^8+fGjHPe(G)|7zz7* z`KG*u8Zin3;--W>d{8J){9%@`SqRUhFz36Qj(A173_2er zZ&U=*xyfk=UZ@_cLbh0knRbwC;!#mZX?lp_;H$lAU5*TA_gSaU$?L zV&@`GD3VHf`@Oh#ct`wu`m3Yx16j_ksvh(>G#{=UE;6I0kB%blM#VGd|E-02dH=1f zAy-mZjhXHBClM^=qoDxM1PdDh7JT#a&o4f3dh)YZU-9=e-N-30rPkP0R$6sY8Mu&q zH+huyb*l6nlfMSaPe=~73T;jXIl=@Tj3-gjs|SQzO|f=^KgHhwJhgtXq0WxorXCG6 zR%2123vE1NK~M=-Fv;oP`}*3Qjt|Y>HJ-RloirhFHHcdXh?%U1$fsv58rVS|J&&Dh|!DVE`FV%nqVo1C1jt}5R6iq9XPA!QqV zBE9K<@qbo_*~38pqYyIaRH3%;g6-ap$PBEJK*i z2D2CU#)uk%MGqNV*+VmjgtpZ93RW=rM(&EdcAHxthh`K#F`$r;vFf7(yr@b2XkNd2( zu0_%C$Q?_>9$Wf*xg>Cpl7+slq)EBZ!|!GJ)M}I;ST1jL)0Gng4>q&pQ4m;QKTjaDq2YZ%yAF+t=gjtW3HZa`>LK_ypx3@2+wJLan?w?y2ClSdx|=0@%U<@?B?j<{#e4 zSKE7!!%m)l4C*6}h`Ifa)m*e>LFnm(p^~8sOf22Nx)~BU;0Uz&$UWF_=y8Y0x=^6L z=M#gRPyv>cKcG3Xa$)K%B7x8a*f|@P!5@ld49C zAyBP)gM4+rK2^ush;Va(9Kr7fuxu;ZpS#^DU+YVTBO)R{t#*F6MhfQuGLm z$RmfnzB$Kzp_Ir})xQf`veN=0Ji3Cu7s#O2#>R<j`&jg}m|vAC?oNN@pE@azNW5*N z8(B$h#GaD{Ng88vqvy(7MB?#M>)KJ|d|;;BuOIkjv@^z@ywQ^(F^q{U-KW73nHFwEaN z9>_=371jE&x5$4ibGnW*;sH}7ZZTr~mcA)lZfe#HyX$M1J}mM)TC-LGzO^_=-@f>* zYz<+qeyx*FKu`ZUlRs+0%Xj1H=lm^S=06$5>gnaSbc05zh`K?iuafssDNv+*FkGD) zXuUgq$DFkhnIj*bawBq{Qn31yqKW{9jL*3_;(;^GS^NvU8{*-}r3qLb(NRnikX(wn0hRvx3 z2;Ec+$v2PU@d5pPK_dlbAVhvP4);dW2myob8=}-tlohHBCzjPN6S^BOgz~)Kji?F+ zceu~{$Tpu*k7`~1xB>rYnV?4o%42a_@XUV%^(@#QpiNV}q&CE9?~Aw2op?V^-(T<;d5 z^cEMVY@TJMS;;f`6HnvqE;JZ(z*kYV$1IKzh}Km!1P;O&+e>fO1S8U@F^)6tQWtMr z-A|PJ>|qDO*s5PG<7f8uB&{`!$eB3P40+n_v7Npe@!AS*>?-q84+s0FVB+3j*4rMn z%{TI7qELys`egc{X^kQb2oRhUQOJI-nF|mBIqHirIntcU-W#Yab&zo@a&y^tlM)99 z5l`X4=dX!Y0|z{ju}gy(2#N@miA^7&+N+bs4EDM=)Vc~ldgaDW>T?SBW)Z`w;0!qP z(#BP{nZjhOJ-$52;0mU^lN&tnsh^ck7zuBVA1e&o)DFaXgWm#f(6?BEK@+Ct?iK1* zeou%=IltZy|JzlA9Q0eCWYy1r4D0*2;7r4N6M~HBxeslH)?g8#{7Frbivl5L@_3SK z<$zzV3LUcp_9jCkXxSYzef-83Z*}#;N=XN0qAe7a9Zg=b&ty6nuQ;+421!eGZMp?& ziqz|(NX2jhsLfZn#Z^YzKPS!@*NnVx!!k`nFhL;DBL33}I8GmhZf?-XcIj-*4PDHs5n;^~4a6#ovt+U9 zBO&IgRZ1$7!|qV1fFmj=1YvGUf?+yUx?#ux`t-BAMsv@&WqC0yqBW^@sXul^r6fN9 z`~nw1Za`>V*=1T>Gv=TdTHhiD;PUCK&s-C-&&0X~xaH@&P3iYSp%W&cq1g*fmH z%^hSGcfiGhZt~s7hoEAFib=9f6rho{jVBbL6rs0B0L7pgaVbB3BR5~bBY?CCiDCsM zovP`Qlqq6v_x9t$fA#uv&N93AWiFe!#W-~Sl4djlVm+#~uG_4DZeP`0Y^Tz{iM@Cj z$AuuJI&RQ$blbZC3v?S$%4PaD4QdeA%<*y5M_`(#XW?eUnIYgkHVNH+q|mro+U^H1 zCY?Ivb(`4UD%uRW2|&mBroFKylOgCyHh?@$X_1gqOo;HL>>zrN)rrn=TwFDAo~NJo z!S~2eCPaiC5GW&Mozi2w)K@yJjw2Y|RKn)yKiq7{)rcQaf4j)|$1*cZ<+_}Ju@p+Qoi@dfUIGsO&?jQ=!MP?AS+qKA^-%3EP-m>qYsSFM z!sfTFG6URS_jU;g_)Eye9*h-vb&R_wY+_z|UrUo=4^ngf@+}d5un;38mxPNMG}D5U z=DFXUeWt(23Jx2Au_U{LvXGoAvY`pFmD;rlNdY1bBfE|pGR5O^K>e)jdkRHR(}faA zM>A$B94Xiy&h8~@3LsWk$O$a2+QAjVCR8yf#=t%GiP{oSAhOwHr~+Bdsk-=hp0g!+NAK;te$H~ zYSJL%QrRO9Syn*p=z;Qaiw^gN&NmC?g{f%}18YMg6S_+&sq^YIV!WhG`(@M%<%}7d zAJLb}r2kK?A7!;`M#V`!uMYG&8C6|%>{B!OP?u42Z$|Kj_vGMEJN0|=3B3?u;ogG) 
zHlWVfxeEfKXkLY!1OlRMkodW}hrl24O4*N=iN3hR9{=U!q(CyQj3W0cg&8SoUqrW3 zMTfXVP!>!e`wEse<%r=3f7?Lcmp_vI;CLjxpf^s~lqKs8$7Z2*b;UJy&t}G5wTmV1h_^0(woS`2!iXrQeWvq$;kK z5j;9@YV;Lpx`<*)YLG&WPEEcmqa1_7k93ggyz+rwx$Y9Awe=05Fp}K31I&*|^EK`f zC!n4BChP)Y?5aqBYtrh6W6OpP!g3z`I8@fF!$+hb5m6)ml*pkgXm8wEnU~32{PX48 zLO&CkMSYRWKfV0E{Cu_E3k>heFGqiG%5Q()#R9BuYQ6?sjma)PmtlVg_*h+hs|A$oHYKrq zS?tuYwcGOzfLX$|*3hCRyh^vD`lapt<#+y#_ZgH%M5c)tSIAVY@Xfwql;wp0$ug#t z2|R(rKWajA3Cf6XUt?J0d5h}N0s$vFin!lpT0%=1}T&&g`@M|m5G zO?s{D*9SP{$6-?Rjv_#I{q@)G$v)og9rkE3hj-H7%xFki8C#u^ID~vo{BEi3=Zste ze;&UcZ0^6hJwV(EZz0Sn#qI}ndt1ITS5Thv!StbUV6p~3$$bsXH=)BW7sW3o{Q~qu zfjXQt5t20BE7U|*@zTrBi5_)KqI)WQQmD++zlAmR;9819xE|UwzIF;Eg;5O{GzS$V z9AZ6g@W#<3j&XOtCoI<`qeEUZz_%ML&rPDvdMc4;Bdj`&);XRs>pRlm-i)iez5R{N z1%F=>={mfo>q-|Cn^EC|1mTdG z5ap1dwk+9H_@Um?gBq$h5xI)-hx;d({T#$+EN;rNFF7giw_V zD5|3o`l4a3^D}|7FecCxK9{AE$40(Ffl3(Ascr~(%$cI;hHToZsR&WzmyBRyh$tg= z52k-ll`(}x&-y>RXd-Df3strU0D~TP%LkF+5XgAFGA0(aCUp)L?k4HYoPWOBi@(d& zW~^<&#b^GFW&6t(ZdNBgr;fn@{X>co2)iMUsje!2pn}KS7a#i1r;mu($yT5~Yjc=& z2_3Ln>O>dnF18*2^T(5usBju%@)rJbj_qeco|PoICu2Fx;3S;LZ^~7ImEmj=tI@+eRjO4s0bF$tl2_HTfHMEEykD>KxEC`CL|pBf08wU?0FY;?G2ex z8`NxW%YN?_xs*8c$%_^W7~Y?cE3r6#o>qIS~fJXtyFtPHKwV$1Y#~P0my1mz%@$v&bgm^EWHCjLlLpTSoZ5 zL!_NfN!fKx>ON+b{KAaslzk>MwtcjcL(cfT>1J~<7-%XlWq)Qp;V~j}Ci>;ia(*o3 z1r+&YU0ZTeZ)27I<&_L)tyMpkKP62Xn^68l$fieXa2!Ca;vePMGZP}CLXH4;N`{J~ z0tGT2aG3KQV#!)(_(zrshlKBvk24Ul{sp7U^aat&Zm(?HU2l#+XMyWa$pIe(@SSRl zg$0djjxAXf;^$l$AWn$SIRH*0f4F$a7KQL622)-bsB^gwo2{saYf5w&C$`}ugcXRB zCMQ(L6{PYdzC2&xbs>LB^_72q>>4+#cCCa9iQsI7I)w z@55MTj_`%m&nRF8WNwo7_CMwlk6=({Ai3mPS8+Cs#Fpu!L9J+OfJX9;QFr>w-1=!9 zxbuDyT!7#H<>&Hyb5~q^rl-=fAN=!eSivCAvOTDq8va4sH$SVhO4AG9+9_s%kHcGc zDU_jA$YDZ6GX5!l^tETAlCo7Wx%{?s2oDd7-U_RgK|N(oH!(T=Jg;&%;(edxgMNrN248A4kktZ|r?0N>r#fUZ_NWG#?(4!I zjMVdQ&EB_*Mve-`49(ns8Ei15U_cmw>Eu#*?c!Y@VFl4-O$|k6P_F_2a|sSY^N@SK zJAH4L)JHuh7FPP)@&}xxEcD(p@=Qw8?uB-}Ze>1YH;yTEEaR=10HwB4o;u?_p~~?W zeBgom=OZox8L}-N18ha03)~$AT;Sp%qDP%Kb6C-S4k^5&xIz4SnDm+RUVQ#}Wr%;o zr=iTZTB$l<$S-E&4_xJc6kaFrSR8yIf$0}^M_j%kB%KQwIrD)L!Ii@2Vrj&8Ofv6VA_9*pY%Yu$4wFCj%5xlS}$-W z`QdQ--YJ2zTuuh6n6tqT%~=t6C9!7u%dUdjI^6@h#sH@A>v?_K|J7Oa$$=sYt(=&0 zE|0TGzD!1^(9_3!e!{M1YC~U92te6^|45Y;gvk~#Ol47BTs0u~3!kc`h|5nX<;97E z);`;N&-!dP*%1HhT7=-CuSWDI$Ck3>=a-ahQxM}kvsoLaJIEC8R=F-cUjtRm5jVt1 zaTBEy2tI}DuZUbpPNFioaWElNTE(Tz*tFVq=Ho^vydKL2m<F_<(bVFdBHU7h&3;*+78hXL5+58PBA`{dtIce|GRXzHkmN1p54j~+0^Z6l(p3TLH{)UX@wFG^duEaDI^kC={8`$C z?7?<(corq0a0hFsCY7p_Aq^I6H;gkm_qTSjwqgp1#`3I9KQ!N7D&-+>UGJ7RL1lw>Urje5g&q^q4eS+rMygZFslR{+*9i(bY zP7VX;2FX#j6eE6hlnI>6!5QJ+DO^s3Dns9~@nEN0ei7rS}U-mvch*!h=*nxrp0t99H34!KWXFHCqltd3GIc zi)g)sgn3)ucwDEZr@k%{ErmHQ-pTq6O)vWD*hwZ4I*^>4lp`?xWf3Q(&>GkgS6vYYKa5zd>UTy_1>!e#B(0(YEY8iZ`}<=nKX_ zsuH9!$ffTB8=*PsM-tk7%k7B+j8P^#n`(*awf_gN`lfOc#tl8neFfL9U%&ok-xHD{ zWzom<#g#*gxTaf(aMNiMJwRFMLFuI@4UzQ)z1dMS*kw42ZBA5{Tc!6C$7CG^cyo`49Q6dYZCGt!}a7D2tT1Z>P%@DynV+~<^d_L~gdj41VxsVHe2BGG^ z-U4(#3GKDp*EFPaG*$TD6{}aD6_VgX`&OJPUb(m?OPIs__ZQf7`l?CdL$Rc(lZX;q z%_n)-Y&-eb!3iVBf0xQvX3!MjJp_01xH%Jpc2o#$Yd~Otyn%pfTDi>KDl{XeXk~wS zue=E4=78+mGD~KwMB9?v8LI&J8n-+Hq7&~&y@I(&*PDJCr@v1YjYy5O)ohF(C*U1L z_Q%EFm^JTIHIp!xU&1H_DjiU2^J&?cbsl~B<>&lJwcFuNjaIP+(iT(Ui!r+^hP(6vAG8S>#)r9k!bTKl`VtrC#R2!K*?8T0HqaC7yk z*p`N0Xv_&vfnpX&9kbe@@DZEs%V01(5!q3aHa3BdePLI3KdlSon1EIX>+8Eggnbh3 zo=s^#h10r#d90RrflP$al(=_96W#Wmtst~L<_wyvhWGHQ_ycUib_ zQK~?I(k~N0!3;3PB7vjAa4yPUZFP;oe8+Oiku0mZs zP|K36#vI_Gu#nT7!N*h{a^B+Ow&BtJYYdf!u!+0|p;Hg2>EXYy`T__H`32WDD5#?6 zySbD=EEE+x@ogVlp)Yy$<`5T&Z~q@W?F8ng=V_N^OcA}(Zj)SYe)>y|Lv13|X;-g) zh%P?0B1`2DZa9k@d8un(WHp^Mb$D$z)|Y&({ItaZm9-+0%n%A$NWS@};W^{yl+u|T 
z>CwYgn6ZI-Tr@s5S2+ZV3%oPgxsS+|SMeg;AN@1un2lU11T&b^U`oA@W;4Y}B#(oI zNzzZqy-ghA36T+q6b7Khqwl~BQtod2bnq;b`7<+}Eav5N>fE05^8)ln)tgPytmV}= zDPE=fbMX_a7wTJr7>AQlLS#9F>w?PQD0^`$P`{w78A7x+*8G!18QcUHpEzUv+pj8F zETY*Por!Eobw6;3>oe^TmjuUvkLu4;dyo`)QS|Qeq^OZIU*~S5+grC6e5ufR7pHeo zBosKYS*)`7CGq99Zg0CJYiJLrY9cfKT?zmZ2=X;b`i0#5W*~7KCkePF`vN5RNc~l~ z*!lg0F?O}-_M`j4^cPsUUvoTm5ds!XBMy|#{;dI>=N{ZB7)0Lo$>>q4J140Th z`8KhdxdngaLIR}8aEq226sa>vFcOkP7dMe;o{M}(7+wZ__^*VM(~mIqP4f*_x&QcK zF3!RnR%`95n0oP+1+vK%bs2WvQ@sN}W)$?H89MdkT#P*RtTPcWFfX}G>Z-C~*%7Wy zm~j}mPvi2`VmthR#25nV!;mdv$o*XrG&_h8&Y1OIQiR@~F|dY8NOUb?hx51wd~xF{ zw-^g!ke4ktQB!$o-5H9%CxuFO?Bd57I$A*yIMnNqhSb>51*Z6f)`G$zr|3uKxX?Rs zWqo~N%jH$l(`Q*;^4{I)qx;jJ``xn0_d*zW;$6fwA*jo5%d0r`l#d(U5C19anLdmQ zmroVFe`K+!9|ee(I_b4u<|FGH7|Ts41A}K3XBD*Tj#`+M$KA*6;zxj&u1)k-=h8N8 zVA&v;TEl^e>ON6kz|^k{86xxDsbn!r=#m|&;7C-5G(;R}iF&6}!&{plHuxij92FgK zqQVCTPB;RI&!5!+<2Ej#U_-Dm{Hf5sD?L5s1zdb2ryE7m%A!FZk-p#-fl8|*V4Y&T zE=^V~vqebx^u4dhr-Le#bp)j9tkkODddf`^@Z9;3%zj@3lgTq`85kcpC6E?Hwa8I8 zt*FxkN!3Gi^2N_YC=h2aE{$K|b5~C@rc+Wgd<}50#AQ3%TPy2J{vGq2eth5Ol=^r9 zr0~5bOfgQVY(GW(yk}ZE#)eFYeq5pWgD|$RwI3t|P)JemRAsZOuDVl{n6?wH}H6lNPbd5dUs{v`c9;u^C` z$Yr%zicy;o_FLg=`5c)hbMaeYBbYLQmt|8dJ~Ez=g6=PS{q<|=4&4H~B1EH>IWkfX zxoHTL`SYijH#*zS<}d=+htF$Nm?sy%$Qx<8CRT!5B+w97SY_gYz`2UgW@kqTzs-yF z=)d%$zlyUeD@+gw?4<}9UH?Pv25n7G!nbP3)C5OR7YUxpnL!3Uzi%={CXt#U%gfxj z4}u&JL38;;s&H|HNvsVy%&NC0+VYD8(V_}Bw`Lsh=JboVoZ?5vwJO9He|ixBJD86e z@W(Q}-oOk|v@3tQDq}(myKz^&^4c}uNanU&hf{6a(@vr|*oT$GA0A>Zhwk$$k%8k*vUc@O2_z09)I4`c+O3HnbZyPZvlw?NCrwOlU`|S zROVnYdSCAlG*i`rm0}fT*D^TUq7)|rfV@RIvC8}wCaM*V8Z|{{1*=|0%?6H7T|t!pr3tD1{tuSeT)x_zv(9&x22b({GuZ zVJq2LJ*!m4Sk;JB?n|y+L}lD0l!6Hf&ePnfpEyOysf0#g#N^R7H7fjje!hrUXBtr7^_CGeRfq+q>+%FSCPLXEafh20h;0*y${@GL1HGN05F z+wXWXtqcZdwyhI!`VMe`5V>d<%82GN@gFkR^-8}yZXRo94E7Sa(9Seyq=1RzwiwL? z`hh;-DphwT7Oa&CZ-S$byZE4f;5^%IC_w59VKOsMaX#MXXBybFRY-=go)h_F^DH9 z*n&MlL)X<^_4e@Brv-8uY`K zG+GGLapw?Fqk5(bZHi(C&oBZ2PMPghCA^psKR~XkmxSHhBh!2a>+ogpvz;6vyDEd^ z6yyo+jIlB}{vTuBadwP9_0&`Tdj?}eD-DxSiG)Ap(vju}HGq)J94L$x`U{A|c57}m za*-L5uxk^pCN%setYi+VY8#+F!SV$E^QgVasmIJ!3p;G?M~%+uFFD0Rk%eI-SyHUn zxu5YmTh4uaI5oAt;BS&|2VMDjb>J_@MH%XUa~MP75y*AYaUht{=f{^fpna-(LzQ&o zpAefR6^S3f26Nk8lI)>N6vZA_RGWio2bj;XD- zOd&q$*G^BLkY|Ik0N?$3m%lh&`qO_3!drcTTwsM)As2<1cjkolU37Ldbsq*abF|?C zcA%|s05nd7*+7~aW+0?uzo@X>B@{o7y+1orU5Mz>iC^h!TmEL;03kWOPpTU^UFoSp z$+%E9nXS6T>>d>6J8UAa?3ovR{Qw~go33R&BgT!-=P>XN;6ekGW2GN_0Lb~$^FqK` zjlvM}y(+-t{BnDuV31j`%PMvG!X~G|g|^ig6T#dRWOCAUaH9+5X}EwEk|Ecy#ZVn) zfFj_W1UH^@a$I%`(V~>XH+lK)e?o$pNJYbMq`Ut`Da&xHqbY{-?@=jf>i_9<7n~CP! 
z&j5oAG0X^~j4{pxlT0zq471EJ&jO1qvCImqtg+4pn{2Vo4!i8J&jE)Vam)#)oN>+t zmt1kp4Y%BJ&jXJ<@yrXayz$NlpM3Gn55N2ou%|zP1QARKp@b1m1d&7$O$@Qb5l;e% zB#}%Csicuk2AO1$O%A!_kxv1I6j4kGrIb-l1(j4$O%1iwQBMPnG|@~8t+dfj2c2}$ zO%J{F(a!*b3^B|Iql_`m1d~iL%?z{5G0y^vEV0ZAtE{ok2AgcL%?`WlvCjdA9C6GE zr<`%l1(#fL%?-EQanA#fJn_s6ue|Zj2cLZL%@4o)5%AyjCy*e52_cj)!igY~D58lW zmN?=`Adw`JNg%G{)heq5=1Z| zgc3$L5kwM2G%>^yM?486l0-5oq>@HD8Dx@0HaX;yM?M7y zM?DQR(nK>Yw9-a99dyz~H$C*yM?V7$GQ=<=j55YJ6HGG2G&9UH$2OwwoN~rF7hH11H8qRnJo3adFTC=`J0E=V#Wz3v z@<+h`)Sp0t2quJ3!U!jVNTP@)hFIc=CxJwgNG63;(nu$ROtQ!(hg|Z=r+`9=D5iu` z$|$FTN~)-)hFa>Vr-4SAXr_f$+GwYPPP*u(hhF;VXMjP57-ob~#u#UUNv4=)hFRvA zXMshQSZ0M))>vnQO}5x(hh6sA=YT_wIOc>?&N%0SORl))hFk8q=YdC_c;CWctzh$n$Wl1L_nRMJQ%gG{o>CWl<|$ftlp ziYTUpQpzZ&f=a5WriNPTsHcHOnrNnlR@!K%gHF2WriWho=x2aIh8SjqQN|c&f=Q;B zW`4(66G#xjgb+#?;Y1Kg6w$;GOC0eekVq2Aq>xG)>12>e7TM&G zOCI?YP)HHQlu$|;uj*e7TfHw%O3k2aL5tIoN&q+=Ui~f71!Kw%N_ST z@W>O-yzt5!?|ksd7vKEw%O3$p`V&YH!GsV>7~w<^Nfgn<5KA2KB#=lF$)u1<8tG(^ zNfz1UkV_u<6i`SJ#gtG=8Rb+^Nfp)9P)i;4G|)&B&9u-;8|`$^Nf+Jp&`Tfv3^2$L z!;CP>7~@PZ$rRJfFv}eCEU?HD%dD`<8tZJZ$rjt}u*)9%9B{}H$DDA=8RuMZ$rab! zaLXO{Jn+a9&%E%;8}EGZ$rs=J@XH?o|BwCz5=1Z|gc3$L5kwM2G%>^yM?486l0-5o zq>@HD8Dx@0HaX;yM?M7yM?DQR(nK>Yw9-a99dyz~H$C*y zM?V7$GQ=<=j55YJ6HGG2G&9UH$2OwwoN~rF z7hH11H8T3 z1QJOinG{k*l~0tzXjm=a1UqnrvVsiK-1YN?~11{!IinHE}Uqn!>q z>7tt+dg-H|0R|ajm=Q)9W1I;lnPQq5W|?E21r}LinH5%9W1S5)* zCWctzh$n$Wl1L_nRMJQ%gG{o>CWl<|$ftlpiYTUpQpzZ&f=a5WriNPTsHcHOnrNnl zR@!K%gHF2WriWho=x2aIh8SjqQN|c&f=Q;BW`CWTbeNGF3#vdAWfT=K}LfI^BWri4<;D5ru-s;H)hTI#5$ zfkv8WriE78Xs3fty6C2dUi#=~fI)^BW`t437-xb>rkG}iS>~8$fkl>BW`$MOSZ9Mx zw%BHeUG~`LfJ2Tr=7dwuIOl>(uDIrgTkg2$fk&Qr=7m?@c;|ypzWC;cU;YUA|MVx2 zAc6@YlrX}HAd)Dei6NFa;z=NpB$7!Xl{C`HAd@Vz$sw0K@+qK@B8n-YlrqYxppq)8 zsiBrS>S>^nCYouXl{VVxpp!1T>7kcC`Wax5A%+=YlrhGcV3H}OnPHYW=2>8oC6-xX zl{MDcV3RGj*CWTbeNGF3#vdAWfT=K}LfI^BWri4<; zD5ru-s;H)hTI#5$fkv8WriE78Xs3fty6C2dUi#=~fI)^BW`t437-xb>rkG}iS>~8$ zfkl>BW`$MOSZ9Mxw%BHeUG~`LfJ2Tr=7dwuIOl>(uDIrgTkg2$fk&Qr=7m?@c;|yp zzWC;cU;YUA|Me%3Ac6@YlrX}HAd)Dei6NFa;z=NpB$7!Xl{C`HAd@Vz$sw0K@+qK@ zB8n-YlrqYxppq)8siBrS>S>^nCYouXl{VVxpp!1T>7kcC`Wax5A%+=YlrhGcV3H}O znPHYW=2>8oC6-xXl{MDcV3RGj*yl*d9BTY2ZLMv^w(?KU){Ggj2dg-H| z0R|ajm=Q)9W1I;lnPQq5W|?E21r}LinH5%9W1S5)*ZqrIMw)1*g;v^Vr-M$q z_(3;4^wLK^0}L|6Fe8jI#yAs9GQ~7A%reJ33oNq4GApdI#yT5pvc)z#?6Su`2ORQ~ zBaS)Ylrzq`;F2q@x#5;O?s?#mC!TrXm0$ek558cE_=`xQc*9%X@tzNSb~@;!iyw5;Loa>wGr%B23^T$gV~jJwBvVW?!z^>mv%n%tEVIHYYpk=uCR=Q? 
z!!CR5bHE`#IpUZTPC4V83og0hnj3DpvnQO}5x(hh6sA=YT_ga>OwwoN~rF7hH11H8qRnJo3adFTC=L-~1uszcGIziQ)}!dB=M`@R3h^<_ll>Ml>2Z$uMA zEOEq>Kq5&blR_$Kq?17=S!9z#E_virKp{mGQ$i_al=GbmDygEH8fvMdo(39eqL~(2 zX``JEI_csE-Sp5)AN>q4$PmMfFv=L?Ofbn5)66i-9P=!&$P&w}u*w?iY_Q1|+w8E* z9{U_{$WM+q=7dwuIOl>(uDIrgTkg2$fk&Qr=7m>&@tZ$H{I}*$BvHKKE$?{G2R`zN z&wSx4--srLSmKB$fkcu>CWTbeNGF3#vdAWfT=K}LfI^BWri4<;DCavBR8mDXHPli^ zJqh$W7A5=bP8WKu{ajdU`|B#Ufv$R&?_3MizAVoE5bjB>tH zK_yjGQ$sCv)YCvCO*GR&D{ZvXK_^}Opqn0g>7$ z7FlAM6;@eeoeehGVw)Xy*<+sr4*AIu$DDA=8RuMZ$rab!aLXO{Jn+a9&%E%;FMjif zi2u&~i6n|QyyYG5`M^g$@tH4t)YrnPibo z4!Pu!PXUD#QA`P?lu^!iDyXE2YHFyZj(Qqsq={x)Xr+yII_RW}A9T}0FMaegz#u~m zGr}lij5EO`Q%p0%EOX4Wz#>a5v%)HCth2!;TWqt#E_>{Az#%_5;+PXoIpdrQF1g~G z8*aJdo(CRz;+Yp-`NePk5b@ueKaoW7hPS-qJsm&DWjb4R8UD3)znZ+9rZNONE6Mp&`KNabkIo`Kj@~1 zUi#=~fI)^BW`t437-xb>rkG}iS>~8$fkl>BW`$MOSZ9Mxw%BHeUG~`LfJ1(A#4#tF za>h9qTyn)VH{5c^Jr6wc#4|6v@{8a6!HJ|1e-TL(Z+Oc)-t&QveBv`-_{uk;i6NFa z;z=NpB$7!Xl{C`HAd@Vz$sw0K@+qK@B8n-YlrqZsP6d@zQB4iC)KO0ZjWp3r3$3)# zP6wTI@q=!9=%tT-1{h?BVMZ8bjBzHIWQu8Km}QQ67FcA7WmZ^ajdeEIWQ%Qf*kzA> z4mji|M;vp)DQBE>!6jE*bHgon-1ERAPdxL&E5G>7A0qw-^Cyxh-td-pyypWS`NU_w z@Re^w6GJR<#FIcGNhFg(DruyXK_*#blS3|f7~@PZ$rRJfFv}eCEU?HD%dD`<8tZJZ z$rjt}u*)9%9B{}_jyUFoQ_eW&f=jNr=7wADxaWaKo_OYkSAOxEKRAgz;x8hJ;tg+k z$9q2TkxzW)3t#z0G%>^yM?486l0-5oq>@HD8Dx@0HaX;yM?M7IOI zDypfWmOAQbpphn;X`z)i+UcN^E`HEW554r!&j5oAG0X^~j4{pxlT0zq471EJ&jO1q zvCImqtg+4pn{2Vo4!i8J&jE-0YzEeRZRa8?$Ep^n>KqE~w(?Tn4w9`Q+UHqV%9(w7cp8*CLVwe#|8DpFYCYfTI z8D^Pdo&^?JVwn|IS!10IHrZmE9d_Acp92p0$q~n#aLO6yTyV)1*W7T+9rrx&$P>@J z@X9ZK^9L7@MEpf0QM}we z^2n!vLW(G+gi^{V=Q|ZtQbjd2)KW)14K&h3GcB~zMmrsJ(!~$D>7kcC`Wax5A%+=Y zlrhGcV3H}OnPHYW=2>8oC6-xXl{MDcV3RGj*5^4sHKj28fc`6W?E>ajdnWdq>CSP(?c(P z^fSO9Lku&*C}WH>!6Z{mGs7%%%(K8EODwa(Dr>B>!6sX5v%@ZX>~p{&KRM!<6HYnf zoC_|w;+h+7x#OM(9(m%K7hd_rZ~oxov53EjB#Jk@xG)>12>e7TM&GOCI?YP)HHQlu$|;<$R}tN~)-)hFa>Vr-4SAXr_f$+GwYP zPP+I(H$C*yM?V7$GQ=<=j55YJ6HGG2G&9UH$2ClmN?=`Adw`JNgT31QJOinG{k*l~0tzXjm=a1Uqnz(lP)QZl z)KE(u^)%2(6V0^HN*nET&`B3R=%$BW`sinXL53J+gi*#AXM#zlm}Z7q=9p)JMV44* zg;myAXM;_)*k*@a_SoluLw<6^F(;gI#yJ;Ua>X?_+;Yb~4?Ob3GcUaIi{Jbq;(sxJ zB8lP+Z+XXiKJbxGeC7*Z`9?G`#1cn52_%w4GAX2zMmiZ}l0`N-S>^nCYouXl{VVxpp!0s&`l4$^wG}%gA6gu2&0TK&IFT8G0hCK z%rVabi!8Cs3ahNK&IX%ovCR&YLoRvb zQ$Qg_6jMSeWt8)s3M#3hni^`Uqn-vDX`-1HT4|%54m#=L2i^40OCS9VFvt+Yj4;X= z<4iEg6w}Nw%N+A8u*ee2tgy-&>uj*e7TfHw%O3k2aL7-NIOc>?&N%0SORl))hFk8q z=YdC_c;MEooBCz2@M@RoPH=K~-4#Am+nm2X57Lo9K`lRzR#B$GlaX{3`u zCRt>YLoRvbQ$Qg_6jMSeWt9J`-GqN=I*h3BxRNTWsiBrS>S>^nCYouXl{VVxpp!0s z&`l4$^wG}%gA6gu2&0TK&IFT8G0hCK%rVabi!8Cs3ahNK&IX%ovCR&YLoRvbQ$Qg_6jMSeWt8)s3M#3hni^`Uqn-vDX`-1H zT4|%54m#=L2i^40OCS9VFvt+Yj4;X=<4iEg6w}Nw%N+A8u*ee2tgy-&>uj*e7TfHw z%O3k2aL7-NIOc>?&N%0SORl))hFk8q=YdC_c;MEtMjPb5*i;VtiY&j&v8 ziO+oDE8mDFhFIc=CxJwgNG63;(nu$ROtQ!(hg|Z=r+`9=D5iu`$|&bM6;x71H8s>y zM?DQR(nK>Yw9-a99dy#g54!20mp=L#V2~k(8DW$$#+hJ}DW;iWmO18GV38%3Sz(nm z*4bc_Ew+tmt1kp4Y%BJ&jXJ<@yrXa{Ngu%uvtvRUqlkc z8{YDc_k7?ZpZLrdzVeM|Vu&Syl*d9BTY2ZLMv^w(?KU){Ggj2dg-H|0R|ajm=Q)9W1I;lnPQq5W|?E2 z1r}LinH5%9W1S5)*ZqrIMw)1*g;v^Vr-M$q_(3;4^wLK^0}L|6Fe8jI#yAs9 zGQ~7A%reJ33oNq4GApdI#yT5pvc)z#?6Su`2ORQ~BaS)Ylrzq`;F2q@x#5;O?s?#m zC!TrXm0$ek4>pR5_=`xQc*9%X@tzNSb~@;!iyw5;Loa>wGr%B2 z3^T$gV~jJwBvVW?!z^>mv%n%tEVIHYYpk=uCR=Q?!!CR5bHE`#IpUZTPC4V83og0h znj3DpvnQO}5x(hh6sA=YT_ga>Oww zoN~rF7hH11H8q>EZ|7^w3Km{R}Y35W|cx${6EJFv%3t%rMIw^DMB)63eWx${Ooz zu*nwN?6Auo`y6n{PmVa|gj3Eq=YmVFxaNji?zrcHN1k}*g;##@n?KkfGU6{HiQ)}! 
zdB=M`@R3h^<_ll>Ml>;3J>-%oo1$jc8(sC60I!NF<45Qb;9@bTY^!i)?bpC69axD5QvD zN+_j_a=ueRB~?^YLoIdG(?BClG}A&WZM4%tCtduYn;v@Uqn`l=8Df|bMj2zA2_~6h zni*!9W1a;TSz?(LR#{`64K~?gn;mx9W1j;K`NO- zyzt5|e)9*L!$$l?BvHKKE$?{G2R`zN&wSx4--srLSmKB$fkcu>CWTbeNGF3#vdAWf zT=K}LfI^BWri4<;DCavBR8mDXHPli^Jqg0fiJ%ObMlwQO! zi*0t;WsiLhIOHcs9CN}cXPk4vC0AT?!!38*^S~odJoCaUzxd4`YziOo7m-BqhPS-q zJsm&DWjb4R8UD3 z)znZ+9rZNONE6Mp&`KNabkIo`Kj@~1Ui#=~fI)^BW`t437-xb>rkG}iS>~8$fkl>B zW`$MOSZ9Mxw%BHeUG~`LfJ1(A#4#tFa>h9qTyn)VH{5c^Jr6wc#4|6v@{8a6A>v<~ zKaoW7hPS-qJsm& zDWjb4R8UD3)znZ+9rZNONE6Mp&`KNabkIo`Kj@~1Ui#=~fI)^BW`t437-xb>rkG}i zS>~8$fkl>BW`$MOSZ9Mxw%BHeUG~`LfJ1(A#4#tFa>h9qTyn)VH{5c^Jr6wc#4|6v z{_EZUDB_=Qt^fEBf7|r;9e>-B{C9u%FB+5o&C|bW{V$sT_CNIZ|KOk2{Fnc{@Gp1& z{{Jh=eGoo_(7^vc8~FG9{|DhS2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*# zga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP? zKxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH z4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J z&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf z2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a& zfzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY z8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$b zp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN| z5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l z1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM) zG!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b= zLIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu` zAT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@GmqXdpBY8VC)9 z20{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*#ga$$bp@Gmq zXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP?KxiN|5E=*# zga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH4TJ_l1EGP? 
zKxiN|5E=*#ga$$bp@GmqXdpBY8VC)920{a&fzUu`AT$sf2n~b=LIa_J&_HM)G!PmH z4TJ{%T@C!l5&wK^{l|a!+or$o_}iA`zx%s?(U@HGkN@f4{F|5l=I?7Gt@x+6|2vlc j?Z373FH8TEe?Dl$zjf6Atmz+`{_#I=`u`mHpH}`q8xD`9 literal 595423 zcma&P`Ez8~btb6US4~fMOi#=bGtn+pn^hE9H1|a$?Zi@5pnxiHbm=Euappuc3k?>pbQ3AH^P z?U0GA%=hj+_uO;Oe$Kse<>ux8S(|S0?1DE#es<(LRpWvzg1OFfE`TL%F<=`+O`^CTK!G3jIk6#(KCG->+n@V~^9A3XWh$BC+Gdr%LXyY=|e-rk^pY0&oS{?Fm_AOGEl^v>kJ!4G`$ zWA_64&(4Mi&;Mum_p_7d3DdI=O`hjW&;HV#U;3r;>EZI}k@D%$^69bi>GAUEiSp^m z^69Da>FM&RQ$8(~Ps`=gO8K-}K3&dF4;T0zF7Q2E;Cr~h_i%yl;R4^o1-^$1d=D4+ z9xm`bT;O}S!1r*0@8JU9!v(&F3w)0h_#P?nJyPI%q`>z`f$xz5-y;RSM+$t86!;z~ z@I6xCd!)ekNP+K>0^cJAzDEjtj~4hIE$}^B;Cr;d_h^Cd(E{J21-?fMe2*6R9xd=a zTHt%M!1ri@@6iI^qXoW43w)0i_#P|pJyzg*tibnJf$y;b-(v;7#|nIp75E-2@I6-G zd#u3sSb^`c0^ef=zQ+oDj~DnJFYrBH;CsBl_jrNt@dDrD1-{1%e2*9S9xw1cUf_GY z!1s88@9_fP;|0FQ3w%!$_?{^6JyGC$qQLh=f$xa|-xCGCCklK|6!@Mf@I6uBd!oSi zM1k*#0^bt_z9$NNPZs!|Ebu*9;Cr&b_hfT>d`}kmo-FV^S>SuJ!1rW< z@5ut+lLfvf3w%!%_?{~8Jyqa)s=)VDf$ymT-%|y?rwV*e75JVi@I6)Fd#b?qRDtiQ z0^d^wzNZR&PZ#)}F7Q2F;Cs5j_jG~p=>p%=1-_>Xd`}nno-Xh`UEq7V!1r{4@96^H z(*?e#3w)gdU#Gy=De!d)e4PScr@+@K@O27&odRE{z}G48bqai)0$-=V*D3II3Vcfi zzNG@+Qh{%&z_(Q3TPpA^75J74d`ktsr2^klfp4k6w^ZO;D)228_?8NM%LTsW0^f3h zZ@Iv?T;N+S@GTekmJ58#1-|71-*SO(xxlwv;9D;6Ef@Hf3w$dDzLf&sN`Y^sz_(K1 zTPg6Z6!=yOd@BXMl>*;Nfp4Y2w^HC+De$cn_*M#hs|CK*0^e$ZZ?(X;THsqP@U0g3 zRttQq1-{h+-)ezxwZOMp;9D*5trqxJ3w)Oge3uJ+mkWHC3w)Oge3uJ+mkWHC3w)Og ze3uJ+mkWHC3w)Oge3uJ+mkWHC1HPYq^NaW&HA4KE{-^j`kNMwb=RcA(vkm;_-s4l zf2+Q~+x6c8zpZ=zSL?q!dSgJ%7aF|&4EOk5BN!^dQ1|%*lzgwREMDszY8|zV3_zIk z3MG?#W!iGkt_&GGw8npp+y4ds#lQaE{|Sh`XKZ)*&z{!a&6zInyK3N-Ec?-3B4x`z z%>1|Gzc2Z3ZIo8SM9MY?Y;ysJf%WcuJ4-6r^g@>SbfxKL3H)>XZoiIw>+uh4BmSAz zW4;EPE(kA)aYZoi63B|ioRI?)(X-^=b^mhQ&b*=>fz?0e@P~0cD$&)ntU1Y`6;+HI zPBzP_&v~4qeGd)W*OOlLPZI&#Bn+$m`7+1p9{8b3gZuCa9hG2e!hQyo-5`j~WXeC- z_Wufh@a=TJO8CmDIqw-h!5P}v_@uO3{BB=+)dg1lO$K~ysm8ZLf{5A@rAfdBq`N2efAC%PKsAYbqg%dP0OsR9GJ z?0)4%yk@{^v9c}+A-}n!Ndpr*e*Ct+6APT>R4;Jorn~R=ddX?7`-`Cb=8%uITyLoF9(ssN zX-8aGz1Ix?Tl@uXfiIqvUy6VId&vF||C7tz;ZPj%^oyro;-u9?``Fy&Y!@$S&OIY$ z{x|;6j=ft)RT4@iJhiht09rv+`0%GW$NYo&Ti%x!Iq+U$Y!U)tBmnlbPwlL)os7j- zVs&?7jO?JyrbM7v6OZke};R4 zX(*00|MC1oJ=)L1)T>3Yt>4j^IWm(!ARc3iYLe7e_w}uqSk*O_X2YHVuEze*NTkn1 zhNB55g-4WsuoK>HfwCKujiiG#K@n8|4ZkFx1pR&JK_^l(CasnfLgT_kJ~J`QjV46fqbrhenak8ABs9Y_5(&Vzrbia6Vfk-7(JVOQu;z%f;aRcR3r08=VKm%8junWX zzdQec@9)(qkNVnZtnjcX8lT}4<0?7C{QCs49(FYs_ZjEZ3n7Eo8YQK0T7>80qAC2M zm%iUj{Ao53ZGeF3@5X4IArFGbJAz=;V(;p}oqj3s(r=H9=;c(%umU|8Dr7s0<2sb$ zHR7U$*Pn#^K~O_fJPSjj7Dt-x@eyc~g4sm1KMlmjkMp0Le%%uf!&*qOJx;W>A+O@K zZm5yHR6}P%3sxcWzK|@il`47^hh%(g%;2zbMfEr|#ISHCQ|a;Tjvsn;rnj0Zwc&P_ zJ2~q6nPDv*9*k4O1ss2pdlcigacZXfb&B-EYMcj{GT4Kd6rr07lMMX$DZUy&Omy1E zp^syg@0NE{uKr5W$=?)yI2C)U?|A#ib#6}8*P&sd}Rjx*8GE{r2SfS-{cY9lW?s-#)fjgO3w0`nO}n3){fx-zUzIT|BzHNsHRDH z`$YqqzwePoV^J$3X&wj+FE+Ek>t}jR9-n@5AoPu^V-DSnC&|Bh<&+F6(d)QL#REtm z8vkALUalP)x`7I`9cgm7{LXJ`ZOw69Ub#n0YvOnkVp1w|i`e3LYx00dS zOuD)FZkv1bv)BJ6=b1@`YlnHft6s3ue*6hQ%~}}tBi#KEb%)Sm@54VqpZ_ucBWQT; z*EnnFQ{z^Qv93hPG7+U1R3b~96s}^n16_@?R>2!E#3YTLThfXi#Q$;7iFAixw$)yU z@h{Eai+A(5{p|HW;1{*6gsyfH_%XJJh+!IcCx6Zdn_9=Go$YBZ@XI_Ty*dQ#?HDh2 z=V)R1b#(NP#CQghSI<7=!EdBst0$hh8HR4LqsObeCc963;uC&l{=JweRl<%KZ9m*E z^X~_vMc9;2O+udG7<^dBXdp8!6v_G9MX!Iz5A>>O`W3=(dHI4t01M-oZBfT=Y7T5a zu^y88(xpoY<{NzYxahop`+xJ_O;?-K+~GeK+QjLXJ^N7=zJpN0+G6GJrJB@Bm&+xN z0Hwk&w!?n%nu%24y023Sr$Lgm-4Yis65I=^WFxT4k*Y(zHEoCHXOn$0Q7)#YndWII zW&E+~Px9DKq~hSj=2XAhN*6qNCuQ$}P<7$rr%e67p6j*!KAD(5er9$?SDPfO0oTaY zt%y?FVd3xPB=5Fqj||$P41&jd%2&KGCvWnLY8no+T?hv6Z#TFfL;HMmI>v$w<~HO$ zv36IJS3N1gvk>unp+xT}+W@B$f~m02ZJa-jH7n|8(cJcngX)d}+e(Js-Pfsr5FRDo 
zZ^Lvmn4?xl8hW~zb+a36VO&cd-MeRyr^qeP#*RV~d~o_T|FYYJJywkmh>50KDXxLRWro#n2{(rejD`F7nSgfan<$nRidxghNW{X9x_HM10S4Ksf4U+ z;NxWD^}tIV&ZN>D)^fe!Y)cPlLDPPq*T|~|4Z2NswU&ZvGKx`?Ua>9t5o>GzPe6br z{|El(Il$%pshNhpJ_*Fv{hrUtNX$9W^=AoOczVDB!jswVdIPusQqzMiD;@0Jm`n+`aalOlJY--t=t2ey5$uvAFyNv|+AgTJWO~DWJ z$Eh5*jr_VWTvBzzE;YHkJ9W~0-H6BDlfrZ^(KeZ-PSlPP=V2sFQx}~Ut#q?!y3=u$ zpaS9Ch&o@N9;L64dm-`;1)yI*nKU6kQubEzw(SIxVPEUl`Z#r*Gn#0(IG?I+Nb+Ot z8TgKl!&pQyY09gRg!&;4&pr7~Uu^0lueB=b%}zp1AA^5|cI1Jo9q861fj<1Q9B@$h zQ{N6-umF>CGoZQ0r=TKv(}&vh`0q2%c>JUN)Ya;-HD^%rXE{$bZ6zt8Y8%lzb;#qB zH*;^tGf4z6Gf&c$VsQWcd5ZpoBhC6%ShldXJny(pu5vJJ6BUy>f8KB8i5f0=_xy(; z<3VI%oDgAgeizE#e%L6pX^t{3tYW_KRcr(-r>~!WnXnx;VU@$z)D^;@H5=m)%i#|i zH?olSwTRQ-D9U&zSxOyV0cM<~grC2a`x|E&+<@CLDUum-4tAs;+6pl?S~w5Jb?)0% zl10C5#pM>z~L3>r=wYIj!>ZWO1fD*&pV$l%;FTs#c_P#XqX;JpX9 zZO~MUl&WUaVi--}pY%k7N{)pa+5z$)9>0r8Q=J%eT&Hlq+8*(Nk&M*SFQy9(32#J6 z8%le(P9=kcIaKd#uq9n7T{wc$>A;wEq~dbu&Iv2H8VO`YF^GxE$(D3 z1Pfgi^Q~?hNM%}bg0*W#mAUv{k(AJ(_`t4o_?f3y^h0LV_r(f$^0Ec zbMkHR1?V;Ja>k+5X_$X@8;2L2s}YsFJ%7*WUcFB}fJx~=<5|&?N3E>8s`w9fs<5E;MOv$OxM;Y%1oYOn{so1Ee__Z4 z;@&;|lHu4v&!GudA+H z^Ih30$`Ljo=`8-Ip0=pS&pjtXct4K_gfG9SCc&?f%ghmQL{nhk@bVkTk`xo|Zt9i0 zO@!OQ@RHv`MDp?Ubjr67Zt0#B=5DjF(dNIaxg1PdhiYK7L5>rs*;ot^l&^pFQ%xMT zQ0Tbpoo&HBD-JgJUXwn$DT}E68($FWI%Hz(;hzY<2$ zIQfBO!S2{HyS^hsz>qY3@;hCCrI&5+cx@BiDVY4%6K3?tTf)jB;rZ2?DhVu7d3)7U(A9|M91~ zpnTBil-Io`?5ZE1e%aKsr5%JTm$1J4DStcvJ#JkC6@p)fY83WRufAhsgI0Qf=9I7##w_efSEV*Q)+DRd&j3nm4XCy-u*r^q|Y`Pgl z0MF%1^AE!VyFVQW^^k(K6ShK0!D%s$pk;)+1t9OHezcm~OmDpg{NVJP{EN7pjx2~G z#K7t`qHJ6xeb-zjQPRn<%ubyOKO9ymv%x8Iu_tdD5?G$)_g(~LUJTLRtrdtd4TGj> zxAI?N@!B}!FxW+LuoUWqh?iXkK~?P~->mkKWk6UfDY?sH^+&LjlEnUiYXkILdAEtM z)(Djw{2H!${%ML5x*2(ELwWt6S*Dzw%)$W#-9=L@1k(eD<~UU&76f5zyiCJp zS%Ei{(_{Ph%@pQ;!=SL=N72Y$@|hZPN8AvOop3&aMNf>~-3)-L=7y_2mLdyj;2ELq z@#~qQehB>}7XF;y?#Uhj?;x;kFU@;1TWHYl@`rk?#|}gFfqTG{OJ;K!hp?Vah+#$H zC~)}uq+6f6_$e=IQiY~4|3O5oA#UY*!fE)jAef3QAr~}nH5ThSe@URGZm|>6D>Rtn z@VViD<`i%Ma*6#KJHs)t!^UKJ7p{jO2>avYyOn8x5hSrgPIGw1eH>jc zTn*(z>({$d8xGUMG#!V4xNx3N)q@mQ1867w=H!hW{91kF?T6TKItdEeylr_%Ob~EA z=p=0KOFsec{AKQ%7;|JGq)GrUsNAt|G=w$<|6x)f6e)iM+YGhxoAbB*6=|DeV*B1J zHpOOg=+FV-wn*m5B%dV7R~0ty)1gQ@n@UY%B+(C#ppc{-Gc7c`DTW@D@r)3pM;nxU z$zI*{0LbPjMgw&9@0ljI6S~uXPTV%T!6GMe$?b@3j;qK)D6Y6cD3R^BICB|0jbyf? zbh0=I3g?Jtn*U3G*ho$U;liz=FUWvbv&3$%J<^May~ZBkXz}4~OSiOX&Ocz>A0rgV z2f){~o1@0lUD4N2q0@0Q4mDs$p4g)T@X;)_3%h&eay4KpO-i60_*KRK&RyaJefozC zLf2-`Z3l*S?1+QwHt1c>pjTe8D`nF572`=>!3u3}$MOezsN(_%NQV3T?n8Lo`(;?N?rtC*VejB@Ud%2OI6Z`|%*o$$#Yk zOlUY?<7C()I8X@AjqCjH7ariu@Sn=akk^3A>CzEz6t)mMKGHEOm%$jcL#fCX@xu1Q zJuvI~O{k?p)!+W)lD~mQr6k8X&eo)HCKnBA&I^Sod}k+biq+LqNu(>Q{1JBeBAD~8 zL6R7k5_jnVgVC!(yoPG^z&U>n#>i2yhTSycaF~nd=HL3U0N@6f~cKoi#B;$#}Uc{K)+yQTFN!kVr7#>OX=WvqWWP^0()zT7=XvxeL)CM9fa*nD%MDh$q z75L+J#)r2B=xqrz=m*Im2O&=tW-L=+8;HThEl=zDQ zQ|qOo$F)Nh_gWy&6e)v4|1E!Y2sPdx_u5uCn}07j?(%j)!BB!@r%sw<$jv{(AN`i8 zbD}UGr)-9;X$DFGIldo;mrDCb8;yWBedM8Xu~kNV26Qd)Cj@&-^ai_KHt5B04sUJo zwR;zhqk32V5ZV}=zn&%%Ce5Ct`uzRVFY}GHZN3ry01^naxx^;ZZTNC`mJAz_GkoVM zkZkQ$j9|a;-P5m>NYirh9D~w0&nG{b$iqU6A8m5#ItO)R41tO9`Ij#VZ{t++pa9WO z=G^oSmaqyKLt*`sDno4KMbd8JeUnH*xVRnVw$!wD@?@NGl_2*hLEFZuJV?++t@P=8 zQswJc{U!t0qW90;=cPd@E;9J!O-n&&XNrpxL6A9b>F-ZMCkIE7sJ(QN-@>aaGXa)Z z@Js4X3B$Bf0>W&HYz%JzJl}lnOn?I}@!UL!*5q#iAczdjlkx~ zcM3&N0221oKl0z-4EvQ4FAvm7@`Z9r^nxP!1|U$F0j`h8JCl9HUx0W;`S5o;H()ZI zW!}YXP>$Rfn=8!W>>BId9L5|-J$q7{jV)2!clt_Mq{? 
zsQ!Qnn%Z$LQn!T8Empg+&RHNTIJ$C0Ib93ej9)a1-KbNXyc z;mN_sZ6*gvMalAtaE|b z8A=wD9I{TLzA}_thJDiUfX&tmlW;|Pg3aQ8IYPRyM>(6ObpE{Y^tV6puRF8G`MJ~9 zrx4UU$+%+`i|dRe(jRz<6Tz8|a3m>-teAg61JXA4;5-`kJTh#RFg};kDuHas3XSVB z_q?aDz;$kDt%{~J1Cew$7eRlp%(3oWk?tWBKfm1!OJoxhWI9ts|3EArt^uzLX+m7lkX&d24?uhPOA)`v5ZORGZ9Ocdh6*2byR6Xb&~XIAhk`} zvR38OJj!_Ar#{ZzYmjSXu(0GJ+YwZWEVOzk>9!M|_Ei}X8&P!HW^VCOafk0N{N{9g z_yFA;%ByJ%cPp8Az;INS-@CxgK9J7~Mq-FQ4d=s z4kCOf!w6dvMR1pp6yfwUj&9V_tKq&%Hv_|}Grf$nDZ~%ES+E7JU?|ej+mqlEy_-8`F}6wzH-N1w@K1Sfu>?=6{+nyy==;)p0!DE2CTO7fY1@APZQ+3PWNYwk?v zno+qCQlk_-!@7~aI)4XBVJesNKU>(a$SdKzFhCZpPVsDh+e0b_F*a^eGOUK7xJax- ziXH*j369iD%p$pkFScVj3fCpf7iVA%+Ji|KUi(O0zOCTR@Z`H3Zk+S&7TjKIQ_#V3 zhV!fZ7MDTXHiFpE&%^@hUxJ*=qc#aoOyH2fWc!lleULgqt&NO zss?A$w1Yh4%Q6*E`4iyuBg6WCuNgiM1fcVK_ zo4WcmG6o>vQq9ys(u#xVT5Kz%rcazXz zABuslmAKGSlvy!}`Hbg_NlY9Q5kH}zTV}fo7b=m30x6T|L}IWS?DV9Av?F;~W@E{R zk$fWKDf{~2Kq-~7csHw$Bx1m1xyDa?!aIeIfBm$n>N|D!e~5R1iVRzulYw`FpbbXk zfL(@gHeLX;I93B6wABeFkdcfy-O4KeJ5JV2t2@G+n=X*t$%MeAvUvL4%iN&S%+;gP z-K2MAYhMsRVT47T$9&YEr2yTpq;SJimz;f;*1B2{60D6?^906w0gopg)l``M+D^Ui zhpN8xF}}E+RMA6_UPr2yT11vhL{)vLibfDloAUZzPS?{$as40(#hanc@@MK)qx^nw zRy1ktR2fdcZkXV9tl-m2Y{540=|vcjfRHWcN*o_)oF05X{?BXNwNR}yFoUE#{=_iW z{73wRV+}Mr0*G|y&{|2m4a^AdJDnOJ>GbYZ{WL8=k}p(J%P61Z!#Vk@lfRI%1o2RI z8z&S3&Uj<1k4vFJxofl(dkfMac_#+uPK?4oxSwP-Xb+Qgf1AVcO!K*>pNrjV zgMe+wG~H5JcO;7(xy#@1yhEl+a1^9NC!xV>t?{D51S)W3p@E1w?_mSay;nrpr4{wU zhFO()j=ajR&40var#v)%cK)x6%>uT|Enp#OWQf2Y9yVf|OPJI`>eLt?R6o znMksOEk%q2`*I+p-3^PfS;Q`#zPo-%fKDnaV_{A7u@x@r;mKN!ErXLI6&GE1c=~ULm%3T;viRSdgW* zU@)i{5Zy2w!bd@>LORrcLkIfh5A|MZ>uvHucnw4xPHfsmnGjhjPG|QZdA9}w*ruxa zA9~bb8}<5MooBRYjU^kBUy&*YbWdh`X}u%ATnigRFnO%=7xLK89L&a#9G*0C`V~*K zSzOz_=rr-%eSCj5g_|N4oVpNg!aV)WP~gG+k(pkQF`ka0)2t<90FEkDU?@+~RxuJ# zH82nboV=MU89F3e!|CpoTnnB2u+qM;+)9c!`PAy2ZlnfzIV( zqs1|TI(!M7v!u6dDNATJY%+k1(gIP_P5t+-yfsd~=@bneRcitkHAC}+XyGRxwJD9@ zCotWhYYaq_myL{3DFa1eT_`j+LZe3Vn9j}&3b@t)^Oz*24TCf$z3-_%$m+H+vJ)9x z@R)1uBS|Khc$k(WAfT_)jCvODEYf__LWZH48da}WdM->(X_yx-s!04DoOfBgoUR@u-@X^c!<%qd@rSCP zEb143k^k=3hlUjfGy<8M^npbWVESK`Fm_Avh*%jn*$S-Js;CA{Ij+ z&<`!FzFseMEBL3A*+aB=0uZ9CmCwCwZjQN~E83SnN^1qjh$A9iDMLaB^27O$IK*n` z!`>KVdoyFzV-FExHGsVt)nBg!>MTFiD4!flR|k15rHd2%JtT0z_0Qyn75%00Jj%>t zge?rTqE3l20L(qP(}eb8d;KQdSNysujA}Ikm$2g{NuVDjs@F?O5EJzYD=PEwGm(Hl z;xcmRr3-|vF(ldXJlrR5l(H&()Yh(Pwd&kX2vBinu=hyb{0ebcLJybQnT8`Y4`jH` zWZ5Gf1|afj1cCYI2Zpl!~glLPfF=v743(r zQDgg=(GNKb4P{k|ktXnF8!B-2$aRL&74nx(+eJtm3@Y-@6}OYR#1Iid>qc8XJg*d@ zqfC*WjmYN=AEXDks{aAUhO@#kL{ZV+KK&Zc@=9_t2FCft${dhl1adj zR|eOzXF$FUxIAeJVg3l3&i0V$LU^0L{$StrujyT)8hU8*>0!yeyv3ALqa8{IT^Stm zcz`$wse)A+-Y4WqiK}igz~NWulgEKF^o*S7yg;ocHx#ci>zQVX)s2ed9V%IiEV55ln0qzWgHX}alexS<2B-E`RA*y$(6Eat0-ti~CtI~B z>As5@4~{k5ZDHgV!I7wqh23D3qcX9C?r2CFxqW0Emxa7t z2ozjXMiESZB|Jb=AaW*cH@P#zNuv1B>UG9BCbC6XZo;4LeLky#YXzE&y50K7JH%)r zG+cfkcbo7nI57EV@~WipQR(7_JQjYBaKBdseK+Rmgw`675Fu|?+9*t-qa*_A$*ekO zTB!8GVNU5@D`+Clku2<5t8i1N)TU58dUC`qBkKtqv+nr_Lg1BJlFW%kNs(4%=N0_T zMl#MEOrumGl|vtW9!LAq!mzsdHfEVOce}dj^hfFB5#KEE%cUrAB=jUz>uK7nN*ny4&!T z4hqW+=5PC~KJ+{%Imkje#@}))CpA)G`0H_KjxbgYAY{JhsTNH_N~!7&3@bl~6ck|z zsSZXk{br@Mb238wjgNx;d44*Ia2_{*2(v$aL;BcBXX4MGGNwADuGxy4%523RABc!i%HfYdL!w4&K zat!NXmGsUiE}rlfEZPaObp$86E=E=QKQKKbTqIo_8iI+W9SIeltjL|@wuuhvDc&QRm_Kh|XsND-d|GM8G7xl5wT^ z6%O#bx4^>qD>dyn%M{`qjk&0P2+ij52n-j@{jhDviLvpXGA_M_ds#ujRwv#rc5=zH z+A2AT4`cS3d;~E0BKSPZ8bV~k9ptR27EUJs6sXxk_%dSb#nn?e`KnpwMlXkup!c2H zwAa!F9GqEchlY(UOp;~E>jq_-kJBv<+VWFK*$K6Y9TDWOIQh0DIJR;Amxf}uARBrC z(FknjUL((t->?{o5PXIyUffA!!1(RXIKe!q>ybse^B%x9`4$*-lx+N5e5urLpsNm_ zf@b74imyl7W;~w zq*h(>b9X~OO=4z9!#f 
zbLCLk5DVbKu5B3vGg~}g^%NgELU=UQM0S2zXf+I9OBgd6cKQ`gI-6$X;N-XXZ#Yjl zCz4B(!YJYUN)7$w3Z>!uhK=;k*PrfSW6`sJOg~#ykas%rK#WUphF*@ z+)0s#@pmv?hT5)|J}Jyy<&!tpL=%O<-va6J`+aS~CuJO<9|{y^F7Y-DEiHU3M?m^i z;ho^Gyikt6R{h;nAXE~Vq>SUx3>Eyl6Dl2?2Ye3}gidZh=>~)~9DhC8n8T47y(wVO-S$@Qa8!d`Bo$+q&fo9S<0x-ex&WO z=Q_s(O(VjSMa}LmE24q-g4oy!SC;nx?bT3ZVl1sgN)q@cQcist}(Kn8BQ$7%^J!qnF$(#?S4x;0raOOl^+sm(UEo4;&d^;j6 z{M~TzK#BFTa+6QjJJH;HrB=T|&_I-@D7iOjq#IaVzUmEzKMFc#svZ&zVQ1_{zRip_ z3P%S&`H^QvtF|yNmaiMSP9zzC0yZRI5mKnt^#(3#z1>2OCx#q=;6QS2g6r^e)})a5 z*QI#gdqo()k;*IxD1A^*7s6=?!B!FVP#Q#B4q0#mr=eE}e!>JX4!yROha954Y~zMb z^79|}*w$1y-{Ke8S9<3Bk=;><_-dGLfqj4^8hO7| zexNgJG-My8rVVI2&DnLgaEu~A$x+u=OQ9QjBZ*$l24BuS zaE{Pii2EO);gvJfZ1=sVCsh$l3}fq*?MdAxVkMu<^CHQnFp!n#8V#ci zlt}2Xbde!0Ate|$*Vny`BWa4AO+ExOG>;hcRd>M0b+3G*4$T$i)@^a#4H+pXz&Dd( z7uUdEaFF`*QAoTS{(_}eIVZ?Yt6nj&QH@z$8N`a0Wi`7xol*<@Xsh} z9AIviD-K@!xG^~YS46>;JQtm$#q1nGdQJI@#4L;<|3h_`{t(j>A_PrVa~_vAZ)Rz0O1j zBSjM6*x--zACu9FdiuTO@V6qU2c3aGuuU6RKhsSypUX05lxYdijvk@FF(VHM)L}CW z1;b|=9^Mv_QRcU)jL;8l_yURJ$nMx_A;|d>#U=mJ1?#2i9vi-fOqI+ZAhVAg^;aJ7 zidRA@2?N&BUg|az8WTaLhK*2M5n?c(rmF*n8Hr*)1Q&Y_Mrj930`xpGFwB?~-&BGK z_L{)5irzEK|0LRLyKn0voQ0V7fW>#|K->lJ&?n4Ew5+XirotXY3o%NPP7M-IR^4#n*-#(-g# z`W89HP_0z&jU1lI_<1~5b)MRR90rKhOMZO*HMNkM4KhEBO)0d=-!H71=2toyqXj40 zPb!HlG+!iu7;vc4Y0z#253lxSMJz1(K5aEwtOy=Nk?qx9kOoyFf5RA+j>2f2G0L`H z5$-|E#xHGO^@beRNki45KS&qlCH}p$DZ`-ltpaaHz1q40m?jR=R4oz_!czQ$f+M4wovHXuzqDvL0$jnc#9wSsF!1+m_rLPuoi!V2HHGVDHyDe6V{^QfI(BiNp0aa`K90a`NxQ<|g}17FMFsH+tjv z;B3mg=2&QGBb|^!^7M6s_w2KYu0>B8XiDDH5bVB%8mS|LtPX3agw)9)>6>j4(&@Vkpo_gVe{~0~^ z+Q_!T>g>aUCCJCNHwnhy{zSf2;5VfdI1Dz^6K+)E#oP2|5SHuMa&HitpE56m4}$j5 zE;E;dMC{0AYxZzfIY|@6EEm5(z*_hj&(JW+^M zu!5s-Dz{mH5V&9WNp5U<703!A7{_5*JW~oY|EkcMa{LG#N`Z?O*DGy8Ad<20M9NAu z2d{ei;)Wt6oN1A+2@rN>(njv&Vh~Ie&+3HdbJ1HKr>MAYZq&(|uSNw7oQTH2F4{&K z!)U1p5$o@97l!B`#t7l(MrH67drPp;1jEJi&T>0>GZR~64|yPDD=9q!G4#Ybss)uQ z&hQpVW|F0KNL(}OLM_f?O5xR}dJ-;-6dvn~STCHsFj7sVx0NZlmN&nGd4%tb8wTCY zzzt5`2w^>uZn%^p6uF4XqeRYBvSgT6h7AV`cDL-zpr;tPvQHJg# zl~?Zd=CH|_17dOE3_SU63XEZRwbnAMbs7E}Zxcc;h^_O~FtLC;0S@~?=~_FBq5+V~ z3$2BFmZc;*pMAK;X)b-p9LHCj+QBBaj91WMEIxcBBzu0 zBRy#HYB^GX-B^R87}UlQPySNO0||nL6Z%J!bW&%IwqHYQ$3^Ro7fZjxts#A~6*;b{5uN8 zY~VJNO5FxPEDyHExM(Ce=&7O)WQAT{^8z+ek7Rnhxn|rfIR|~%wN?U-&yb8}yY6O) z?9DbU!bEQ z9)@e{GwZ|(jG7ESZ$Js>WXBo5hrDXr>~_GQQ;{Zz#XZt%ag~Xb9QiBX&e7xQqoy>O zs`@BfF|xr6$}#Fi+6Ol9R+ENUwA@3k5Q(boC$jH<$ha+~z3_U;tba%?0C9x5B8)CI z`C1r&%)sULz%@_p!}N$fL>j2!>CiLm&y;`CqV9X(zFbQ*TYFB<8SuX%1viRh*mO@G zuscl#cMP&<5_ME&0{#8^RRVx1f}#u!YcNX;!3}7Jiz`9UVm*&D5@(YX?4}0hnhbzN zMRjBz<9pPjxx0JGPWpfUI1?+Nj72e6yO(PCnHmRpFtW%Tf+-(B0}A2$<tc`X#II58tI3wU00-hyY4Uv43@x!4Tw>O>{gtlTh%o$93=*n;XT->VNd zTp3vm3}3N`62~x-JOt)jahL5x-X+vM@I7LAK?tD$^Nh8(365NWYN_`N=SaG<#bw90 zWL-+pTa|cyCU5F})vH5;F}?#gU!HfhV-yphdtL*=y=hckxad8DO1)@E23L-DLCM*E z^4gYHjJm~C$>9oyvLaq|RhR^M{c6ZNbXU|~qin8R-Ln$ba5GU#9Eg^{)cxkRM;=2( zAR6v}$cwqW2<=8T0EC$!mksZ?rid7M5$JG)A`w$Xw7IvCGzGqP91mB(6w9m6y(xEIG;PuS5UP>q=Bz3a!va)38Jeh zNr64dm%C9GGi+oIEk{CuSFlVC)-nn4oM8_xM~=UGZQGzTpb{03aLTv*tR+V*MOU2qU7MqONM zXc9$+mp@^*2`!u(`OQjH=ipi(F0g?oQsjMA0S7o#^0}?5Ur>V9bh3>325zs6gsB5r zLpM@+IE=1-EHgb5lsfr=0qK>FU#hc?EccG*Npev#7zS$*HaaRHi7Yp zY#b&28P~a&xZy2n_L_Bn9Hx^u_!J1xyA_3#CMINUif?+a@2MO@)aGL}|HE>B4OqjO z?)#adu_`mJ>i_SU1hf7B{mrENt9dAro~ED;`GgSk{sQ*$QapE+yG}P zNvjj_77%=RHP95)hZGg~mE6*Fb!xsIIudiYTNi8@2}G_9Nr= zK#2s{BV{~F!rV&|RfNhjqe`wxlWbCM-7ybXtMd2n~f}tt@#1H_|J77ol4& z8&xMTz>{98C_Th(@Edl*4^d68l1J#|M!_=>T|`EDctgHP?iH*SvE!3Blq>Ynmon#* z#0+klnk=*!A~PIzd=JR-M_LiWbW8jBs?I~q#m0+!TDen)z7nj zUm|6{pd08)*02l7!cAao37m|xZ7Cv@n+HduKcWw3D)hUucB)Zr#zCRWEJiLL4O4Q5 
zKMh2ZJYX+!7iPIh6M&K`_NsCta^*NjX=@x&6*md&fibxrWf&brpe&~S4C6Jfg*i^O z0tOEb9sCEP{Hfm1g_oWNX&@!My;15C;ELz-%|U8qTR41TDPWTN=^%8SMfwi-+fuOK zexbmshq#A#?P~n!^c6h4B^9$<66|hpuS{9vm2rACQA<|Xg|?nx zw~=EEb{XhzB8kmZ0sR$<4iGx=7nJmid^S|raO=1g1O^e=DsxrC>s3ksO0R{gD-va< zjsYBfB!2}~i9@_F4Gv00D56~9uiw+g+>Vd|^5~;N0(Xm~xdQ7%0XVR|T>gG@+r)qX zG&mL44F4&?W)dX-!p2Ag7;vqge;CY)(L%xzK)BVzJql=qHPD4} z`f#H}UK>j-$OB&sOe1^JU*IZmk}3PJ@c$^uZR&xKnt*9srlHr<;0Qg7%$225@CLa@ zrb9$?ABfcppW}ZoU*<^c7=y9Dp;I|f*}%9`O*7O@)Hs_jftjd~jCXSJQ6!i67o;U5 z!qIx()BvTRG_VA%!mj5042|>U$|d(~5b0445$)kmYuBz2femU8lTInp`5lab>W-Bf z)(V4x^9^V;92V;x#da+!g}eN6QKRZ6W^?ta_IB9kCr_ z(%J^tq@RF(Eq$Ol)NY}!o@i*|mNX@laL8-OB_P&-{bw$kWGZm+4Dnyjt0ub05)=?{ zv*<66Am5}1M0EAhw@OtNJZ7oB(4JLPjvbk$c3*XA#CbW9P_&)h7 zsc{pQm4_c1y?0X~CAUFu3a{$)H(1ijg{VQMR>UGx)4*!TFZf^OkGSs=w*xH>_;UrN zCc0>2mSDwe*;)CSpS}KDhOfsO9PS!KcI*M1*u>wjXMO!q6Xqwng~fAG>H|YRj4DZ6 zPJAOeCE)6b!b`;E*+H7v#@$Zzb0=cx3dslKm?5#D@RfB@*k1H#xOe`N5yJCQgcTjO zVmk;Dl+-cAp%HfD3XY|f3{v5scRmI8PPw@bG?;|W?0f^%nJt<+nSnQZiFN3mer%)a zn%O}#4JYKomcMy9R*4!T>_(WjS50@|hna1HHk27iM-`NBb>e!XQ;a*!;52PV5EY9} zk;V)xZVKY``b#gFsTsM9RICd_U>ob?0SHKY7tmy@)aa@A8Wu-5H@5iSV9L1L)1#2R z!P~^mo`wnw?8&g2ME_Ex3Pmc!QMPnA{fZY1pXYl<8=xd5W2n-1U};=J>JDdyN9wZu>EMdAjP%x1C0ncJG04Rfz)#0va zUKx5Rq{jnDGrK!Oj5lDMcyK0ZM<^au6AGQ4e~?Il59oUyjrscoYFVW{{*spr@=GKE zK$`N6)87eq4r~bVPc+UJNyy7xobGHNqVa!x`bDny-siLhuroX``nTy|Bbg*61LZrP z!r!cgyOIje3#BX9HJopOsix|D9!=%6!Of&p+Ao8nCe^a@SII?!i%@+9_rue~grfJ2 zA|z;omJHr5!pVgDjs)GwyO9Fm&N>KYrcr~Y@u{AmW?5uh?X{^2Sh!Xt7n#SeYif~TNl;W*%U z*EBRcQbsM7>;IHg#NdtJV9qLmQf-;qJI!{EQy+x3#&#V!IBXOF$-ZDFGj#pH13D2c z?w|3u;i}O2IMb(Lkm(6WxXG3q(S-jBz@jxmunGqKPQOwu_{I6}N3nS`$v!(Pb!n|u?Ta>1@ft+w%mY9)iTrx78Y$sR$7{LG zdt)(@Nhb3$leW-6b@Dx~Y^$4BG9W4#t>untWIK^~a4~!elR=#xOva|dqY7&tE6Y>N z%RVA!wHc*VuX2Bm1TbIbY{)qzLhw)6b3P+XAPM(tPqC}Z9d9aI;(;G&%OLJkY_At* zDqu&1RFDq$(zUY9dm(cx>=?m(bl?(#ewCdBB2SucGq1^Dx$Rf5dXMrVE~nwoEMnYI ztL5|3)8U5y8o!T(fwC?g4@A2INiYeg`|3)jHrxaOG)J;wIyUcI^DI}lH`ckea#;Cf zY~P*80*?!PjJ@}+6g9~CKc9Ff`Mt@;FRHC zI?%oA(KbCKs-;~J!~&M|A~>Y-WJ!;9mOQT zy@>?$G?t!oTtepOFWi?@gx*y}3qg;#MF!e7GOkQ2$Y0U%Q*LE~+NDF!4=})_jXzLq z2}ln+9;MI(VAj?$|GbtZIV~H?483*Nf(43%KdL`@V2$J*<=WuDw3;IC^c+Ljy zsH1lAWdk#chEmso4z|?fGvnh?x`;_d1y9KDO_G9FCQ9m&9OhSX^#Rd8by)oD`> zOGg&Da5tjTX1t#Yb#yWyJPiuuo7CmOEU+`lj#2tL&dnQCi`071pEMj4ZhAI!L2ck& zRv!CE;n=5BpKp*rZe4i>fv3F&(<eKpBs z{>yoEx=1kLS`lR|^*IBCBny5^#0&{{nJ9x))wt6blc9MTYtpRK%3jtrTwcyb5FfwC z=_1}-2S~iNlhHrMAB^&M|L6B_M~VsjaJeO&yL|QBGg*pLl@t}EZHoNa93e@G4!n?s z;LGnu8X7Kp4HNQ4hf7EFtHe2P-~v}~?xZc<1~y|)rd84!E|yKj5J^;T8P>KG*GvH} zp&XY^#pp=GlZo0T@p~jH6ypp@xnO%9W4CQg?ec!;C76opIfibE;=r3(Q3|XfXv8(Z z{Sst#`=?(q{4C?4jk3Jz0ei@Z_caLYv1QqU?w2#!vq;B`SP*tUjArWO^LKm_?!U+d zq3w-_-+&w?%t{n^pAXSVnabbYBkcsdtRHyx8!Z&K)S|A3u7-Yc9Q1bS#Du6)@=5eD zFR850JVV@}B1?d7k?@T!v;goYxCgxF=o<|do8&V8t{1Q)r2&?V=pCqg<(1rN#NmvM zDzEvYWebf%Yn}?l>+m{B3kD%;E3=9lyXUu|e3$YHI195EuXC2Ju)n#nv8y6a104Oy zQZl`mm3wPzYXlwXOM4kWg9l9@A)}jHdn`2u)7mH*xzlmGEEy<|4GdqgEBO}(oJN*!dycr5R%CM3_;Hbw{4!1&;cM=MoMXjE^ z;>%zN=sDxPS-$SAK-59^x52CLpFXVx*TXz|ApWQhfY);VzF%WN$C%~4l41C5Dl|^- zW^_SnXu$iaitSbwC%q`Lqy6I~1t``bsL_TimDUdYrim1^M?e{VKE0YC+2Ntq5;steG!Dd!+=5K}{ zBkpCGB)Rdr>_zz};;C`v43;3Jkmbd>=YKj^9$cf?%O^jX4t*w_rG}x429w{wS>R0w zSN0j_gxd9&A5l07*D9e$WH5J|k$8Ru6*)+2Pjuc{-2Y$#M`4E{q&?+x!_*{kfp#3d z2N7UMzQYRUvO%~F)=5&ZA2PO4t(U%c!op1)@Ftn+23=kE89c8$X z7beDe@%$z31v_Q(|2PxM&9sU+8cy=xNODNc5-e93h0?OPP<dcIL2y)|J-V!tfe*5C7*~tvNl~Cs0nj}#i^rv9{(;5htnKB~Lrz$}HfbMb;> zb*vFnHw|J5ohwp58%0hoxCW7U?_5$onNLza`Z5qg3?8i~5qtWke|rHMHYZ*!Ep&`$YjB|w8Od)W0BIUvunanbCf$8|(eyCC3qo1D7 z;G7{P%urp0(|}N&ydlf1ok=FZ@=q;jJ3L6RdEx_uh7yUPs~N6H2ZWp4IFv9wQ_X}b 
zm&^`9$A?fk(eXJ^npdbX{_?YH}~7pR!Be3z84 zmG#MLjS;C-Cx2ZSZ&$h%`jVJlaD>az-dbf}77o(6Y)n4?yaAwkREqUSQ0R>r98@M} zaI_n4Y-#LVZe&u>Cu2`owgeI`y%zLYOJRq&9wVp3&B**th@BEM#GzX3*YBbC>4 zdFaq}E*frLhU~|g@q0>Ej)sS`Okn-l>wi>;61WH3%vp|riSmz4Ia+*NM83dH`KPX= zN+a*W1y5IzV5Je$3&3sM%jG~Fz=0uVN4-ou?#)IXklm)xXHW5o+i=f$Ya>7+QdF?r z`#Oq?VL;uux6D(s-$dQ>!jz_;)pj(eJWWQWJr1bFCg5O+%&@`of?qzN+l-)d?sN?ceP;--Gt!4czD_K$MX!zIbGhHv}srH-ONN*(s zP23)eHDA^dZBE{l63K-3bbsyI2E(p;)F=Vf``x?#4izf!C_5Xj9+rAaA4fR1f*qZH z`Sf)WuQd@yQD*b1u4|a+#!8`vk>0mG^Hyq2SHj6og+F8^oaYcQ#1pP% z+6UI)$MYiRZ$voZxpTi_8dc&F5Xtc)ao2#O(8D_jD+{@+;yB3aSD3@f53=X^Va!x*K7kT$PXylVPs5Wp`H6iuB2*Iz6~{h@)UE(=2`)YR zoTu$V!^)I$7okQHF*sRhYj;hzI7$fC5iM|4NP$9gtdzlEFp3FF1`BF9b(9wmA0c5z z+i-r}oP4Lm5mrlid|a%)S~lP&{kx{A@ZNF=M1u`9H1IYvmvDPpn+^QJPpv5V^UoSF zoog$6xjIZ#DA579P>kyQYl#Q2x2&y=snP8C<%9b?*|vBlM&5^5y!H2t|XxhkPz}&!izC zlwoEKvK+%);&j`k>F#7Ea|AKsuxCIA;n)FXgy&Yq&RnL%+CsgMaobv19mcUAyM@n*D6c1!%&CGR9!VqTPK;Yt;0(5((F9Cy&~y z6@MK^`8Q)xlH4c_G!T7fA41L2jn#;A2q;{i#YNplNH@CHW;DCqakvLk9%Sxe=*uMK z+>_r0#(-@2rDYi0W1>|7or$F%abPH?_wWvq6dlz2U|tDFb4zk zYf`45XA9f`1rf~cF5#Z-Xsv)LFZ=@EgB4wNH#)3{#VP(ly@mg}lOY&f%&fpVeCnJV zsYzew-!ln}=(f+zpj?$ACqL^emk2PtlYDp^Nn}1>%WP`ugTdfaxds3x5MhIo89Wrm z4<8FpHx4q#$+!tt!pjdl3=DC!TrfQ_LmhbG^OT?u5maZ6@KNZ=#@%CkmsV)5D|g2>L0ZFIO>xC6#Q2oh-dZ%pf5; znz12@cYd8zN4&x-&#~csib=pq!z2Sepyrx0EN6lgym9ab6`K^d2^Ek23*bs*4{Sjn zwSORsPPBi5*^7Qy{53)tW+R`8{&jV4e`Kz{Fy-i zUqu5MX=xuHAA5;32`VO*Wo+)Yq8&p z=ZJZ9%M@Kbl$SE2a4I22f>&8Fgq8Yg7{bqG8w#G(z~!QeHt!aURIp|*Rk(UD3L>!! zL_D&oIXg8gZ!-#bIWNB99VMag0Tqy1e2^Nl!J5BACEDA9NH#s_{%KFGygeXn7(awQ z88j-4Q}m@vvu>p=e$babN9-b@i-q@@bW8_MWI-a+3fD0c5=0BW;c=~HEOIX>jgV;T zeL_}rB$=VylW!tIl96f@z(*?Emh#0Q929||2q$*BVKBf{l52!G*KM9&8MJ3Z0ArNi zeLjHx2O$d4zgPdJeBcL2$%u0R!}92;7(2#bbB+(;>Q7zF{EXgO#$lo|*SJ z4*m&YgOn~?rPN`e!FNBO( zVIHq{*?QdDjcY$5rTrPvp*OB=@*gi>{FGPAf)~g>K6jk1E2{opJ`g^UEf^h1 z4I8}mq#xR~;tFu1Tb%^B-Egqs(^l4( zZz!t92|8DT-<3>DHZOz{k?E!!pk$`?VF7S_h-VaD7#7exFMj6y2nnY20)yc<*nuRu z7fV_VdU{}M_%oQDgz_Pr@Lr6XYm;DBKg%c3ttGR8;Fc|t`xvR_e5l;bHil*{(7sQ` z&izIB(L>&1h*m!W@zKWr3bxajx;;yw2Q4h)Zb>Ne(c?fhvb2?DqJ{QLaBzb7oezmj z_jJ>4Y?OK3%t3qC^8g2&Kx)9S{aIE zoSyVn!4D77Mr>SG7C}2cQ%c6WHNe2QWJJxpEw{mxqX`@8=<B)^^39zj2_)DUqNQSCGq0h^6*8YTr$Op~FG+>T6x!UM zY4sJ|peBtY1gkkY8mgsE2p!XED|prq^Eosy5hy94MvEK^ggL=bIm(iqa&=snhtA_g zZvv!bH0Ykd?@(;)hu|&dr#Jyd4(ir3-pNP?OtK>Hd*hkU40at}xD@{E&04}$TO1M6 z$j!ImI}i&;#|KeVpI$1i4i%tLh=C2&kpb+XI3mPm)X?oHG_E^{3vQ5`i0qzB=tE^w zlb`8LNydJis;HNW-uqnp$(u4Py0}LvqjUDu0qqEP!K=y_qPrl$WVApF7~<*)5-zQM z{(u8hVmEQ|IKRaxpZfwIqbuA~FIT!fTQBIKd385nqa_d<)g5l{GZ(xIyad|%=iFyO$(}x(lbLQ1^e=!HN>%i{68Dt)z_764*(q%Xsef5aM~h7z+?USzJMb1n z!X>DA_`Tg|P=tAj%u$meerCp$3cahLpe=)iW(>O<13%!E^I~Bs=&rKYIfFoU&BYZ;~C#x=d3 zqJXCQ=u2EL+`NBCn65Kt-a9S7U1Uez9H(WyIvMeK)LzLpKmF?I*R_^2WxTf$!sRnq zk=V^)R6X?@1%1`uEETsYYkCb6JJJ<37{;1D%Ml8t*i?#+A0}vo7}30WFWa_&E2VQ_ zNs+CS_vL8;|FEv$5#V&;r!eKpxeC&~wz|#jS-yUaxb8*K4IJGlH20G=Zu{Y;unJHG zTi&612p%EagRVd_+9S0kk6snNfQX!+=mPrmDKkV(xa8Poi^+rO}y2(3V1MEV0S zmI;z;(W+yCKHN%1^pnG6=UwOv1bEKE4AOAFoZSXPRAV|9s9N+;7B6MvxGdqTLv z`3cqkU~v5ObN5<;(MK)}b2W0T`R~Q8FJn#FQ`2x(n$drK`VEV8uzz-I2W^_}mVU1E zP{FSpkX?Vg^SI_G1^>0{${PE0W^lqBXOkT7!~Fm80tO|!7rK$o1H_(3TLDS&`EAm_ zenV=s(EvLNxe{V8H&}S-;ZOua;>7)}9D|qoz@x-5;9rQ_2UxIxqtB-clm&Yb(>vfd%2r9;s)VQ z04-9DEY4IpvU6=iF;uP?Mr`f?lWf$kx1TZF?W?PXZV$Of#sC6hgtJa39r7VWlh{!$ zpf}Na^D^|5j-~wuJ*=%7BqkOQx;9ng#~?sEN#P z@m6i)Yk42ZS7=zQjify0w6_?L^f*xF;>V*l1OUuht_cd!Tk{`!b?2ixo#DgW7?@qo zCFIdSJ!d#th|3I2o_^h!Wls`dF}b6?G6n#*!!{`eP>dz9F+F+5^XxlsAKcWijN0re ziC7Snf1-UQ-8%coD~ch&vq)^?lQ&FYQ48QnxAT&c(?6NNTNLuRiU!2xSI|e!4Wm%v 
zo!SHp&Bc8R;1IG$k%{E}{77xLV3lxoWKVf0h8Lpp27xF3ggQk2!Sy%%gK9lP#?CyV zx8-$zgC=}YH?nn|D1hpUYdQVf73M?wGZN1tT=+~TL)H|wM%&e5y5E4)Be!w>yOCFb zo;aB(fnFd<``7zO?dL{xdb;|KsJ^6V5*RepU{fcGy?6E}8ZGUDGj}S2O#N zvk&XrMVW=R0StR-&;!P4eA0avmE({PI`25nl1Erf8eP$ehX~j~s)j)hy+jV4g$270 zZ=>J&gBE8}=@>!F z1%j2NID#Jnb4Zc%wKZ@zarYqcqjwm_WbSVY8Msr!sefSPnj_<`1yW>{1d9eDYD#&%|b?Ps83rb$;cja{AwJRPU7$`Y*j3Xc-nf{WUW2Y?v2Xtb{?oeFQ@1icH zhm zu&MrgLA+wGe$}vCX7_Ht3{@%n@*`@JI@ho`y%FigE|zgO#;N>l6dpH$S}r>V{+?)) zS*mfiN|%`BeO{bKeBbkRpXi`U5Ffh4|88ae2d=S%_{jr=`qor?wr0sk5p&7&gGOOQ z*+=h-_L1F|#D{)iZvC^hdcy@sj0BA_D>wIJo_UVxPn z^?d%w5eDq|Qn_}?z2Qu3h~xSb;xcA*ahJUu^15;auRr_C-VA6^iu)X-J{L`@@yPWX ze*J2O4{*|=^bHL}2o7xVoD2m+0vf%I^~jm#kTBPNM^8J_jEg0-cHo?d+;4WQXt<05 zHWQ+=$$&F2S{6CjG^|%P)DGrH;Tykg5i1 zJWHW9UXHlL;u=6?NrYGD2GT(|JV*G z<_?sjVpy`4hgfPOAdma=q@Y8@g)TN)oVhjL1VPEhEVirB5ROXnjj#qsZQTYTKq8j_ul>KSGXxBZ)ONp{S-{A zxZugdOs2ElUDG5eD>$TKKCipXE4b*4d)#crt`%MhgFmGlh+Y+WqM!mQl;B@wp5gl1WM&|SCha0rbK}H&`J*;+sP09E zS8)NL6Mi%%(FENje`gVLLiksuOL!nSa+P{OwnNL+z{;s3-go*hB8hVy##bp8W7Lls zU0wmNsZF8;M;IaOn0E|-Qz5gLks?#`W^i9TM$OjC9OJZk=Gl;3mx%wuFB?YYm(#lR z!yqa3PQ9m^vK`>TU_jUCCs&1!ug*psBMrB_!Zm_{mlz*fa1kihlulD$K`D$JCfK8u zC4;;i1_!oJN1+0Fa7Ix7$#>wgH{cVbuZFV)p~7%DoV+2YFtXf8mSOnVJ+YQoUd;a> zx^lPsX*}bcWFE;j+U*f)y05%QU_P?LKkC?8bU7ct35`of;YzPxqiCVpl!fEExGeFNRP?PSQ6Dbc-5y1eDB`UvFGjIzNA=>Rr-Sz`p6aHVu+qfzyfgPE}E|@`H7Kd=;sVPBMnRikq7 zLBSTpXpfoe?LxEwC0yLe4@A1eA^%xVRW#njy)$gXdiuH^dtWy5=kNEDkbn+Ht^a|Q zLccI5Na)n&-;Jmzsvy36`a8z>pcpRqWn5vS%|avyS(dVd5A9lpq|B&{;d9x&4cw7q z?xJ^pc7;;h4IO^Kmpt88^oRwMSSs8H?vUtaeuZ7teICe1RXw|gY<{nUyU%gp81dEF zYfu|Vl#q|oOJK~FGM_8%qlCFo^OvG$1%?TC`T^O@u!GBdu&+2&jZ`cz?Yd+xymA>bOz$AJLraZ*=nDXVyo^N*{2-vg$(zGp|) z8EnwPHoT*rBuF;HE6ZsPDoAlW& zssw8A{5z9sX`c0`+=?$1iHft!eb5(P1s~j7(b19C`lN7az(_kAnNH zcCdzb8m_)Lerwy2!&#(CD?hCO9&vyDU3Vwn2am?a3@icbz5a^cLRt-Q-n;=3646EJ zHcH!6lla{B-+$w$xsLrSbp);e{DTBj42cgl3-}MW`r@73?N_Q(_LA@I(t($cP40iLN5F<<|A(2@;!?hi zM5Sb^{B1b?Wq!yZjM_+_3}#707XJ zS181mVpowM|w;c4gLk3AKQ(wuI>~ORik<5%Hd$~ymBU2Bet87mpCtCQGvR~S5 zShSJxUv#tBd##4uRI_-dOO&!6+AOpIrdL<M%T2(sB2RGviCMuDjL zO7u-o98E8guZlJgBEcIkB@46<8RaZ!LdHMz z__WREr!V+%Y!iL~#?UW{4mY@J{5%cZO))<*Y;Xt)@RmsTDWS(@J#=wNRw0=G#&BZX`&?N+iV?SR9g#Mz)|RStgaVydYrSa1HgXp{HM zDpx4?MyTb|o7S6moUuw@!0(%TMxpm4t>eA%o0{GB_=ECsqZZ`Vau%aX=@tzO9iZRE zPcVkdr~qCTx8Y0L;`QbVMG*V20Ac0IxqwGQDih>9Zui&q@?9!4c@ii*x$_tQOYaf9 zUm`+}>&mpQfTIJeJ`(mh*QO$Pe%x6B6!LEYz{YW%_h38RKDonWBg5-7bJ8VpTMB%O zGcBjO6`+dozjY@CRe)LMh$^T}HlYUTotr@`Y8C;un@&pFj$k7-Kyp%19|{>=g!fx; z2k+-j(|M_1nTeWR5i3WIBd1wubHQ!r*{75(LziYtMO=F+d-zyUc9p%K4lh_1ASc^v zr>IZP(Yle!_K8S=y#89#_=BxnD0Mlo#ma;ku9JEfh3Zf3(N$5zT*4NPf#TM{H?^^< z#>q-FR7P}v-QYK3v-X!1Gn^%SQ3nioQnGKq<5YqAs{B`kmf&xs~W4syxDas6L>?uvOJqK zIo&O15Q-ttgaA1VPb;$lQFu@X#{fNQnCA7`F-b7)ikclpuwH+{&DoIb7r)MFC1Ny; z!l_oidJeI!`Ev(zK8xL*JbcQ_wEAmv82AbE;PS7lEz1O-xjd|R^fJewDKzNca&*Zq zFVTvvLxo^M7~+p`jB2l2bSvqWKx#7Bb0$(?eTVUUf3xXzqKFB ztNFn()iw2PBu3eebk1xE5f}e-`Fif0AZDshukBbHP-jHTSPL3~9^@JDPMwV3EK+{u zg_4wg92XXbj(}YWq!EQfv29?{sgXf+>UQGMx+DX+!aX4fJc{W=)Y8N_gP<{taV#USHBt2${OxGW;?xOskIAu86p=ACoXK^^LILAgGdeYh>XnzT=hSDK>T&VUEp1_y4#5Vb!ru17OMF*Dp+@ED z9qx^=a&mDP0pVu?f_PAx5X$_oqN5HT`h0Mn*raJ>D=bD7;Hav?Ruf2&w=JI}_8RRO z_{&Vc@XkGSuF_&QsbHjaBW}Z@#A)RFAva!*a)-3t(x&lik%sOgK5=zleVNfC97#DT zEpRwG%d15`s%kIhE-eqswYUoUGt4KeuvO@TihvY6$<0{-i_?mS+>Jg6bOWjbdo40c zzxpU&MdA#4MXg*dxT+c=W%My(g6g@AWiw$oRP-ZiBs4*ob%VSQC=w@PqFM>k* zSOiuha8X5#c)q1uLyJhj6l1ER=9&>XW9|=~{7q_YEat<*sT7N-=N@Y&Js0IU6Zu> zE|)p!@IX9XpY5K6YgqX)`8O&}4(e8IAckX?ba%w5wS5qWM1F8TD0Y4}zp5tG=8blD z=M1PJB|a!>MoG#K4=1fTh|8w9Wzi(Tw+PgJB-_J_T6G?D1`Qxh9xH|pce!3_=j7&6 
zVQ#QT8)Cg~{9d_03gTLKRoQWR0()J%pfuw=WkMa{=giyq=(!>oA;SGMCt0SeDglSu ze$X{IeA_C(Vsn|Dnpv$*=j>2mp#D}$fBet6abl;Xiy>81soPARyQg``tKom&o?UA< zBJz0+Zn0XNf1t)`#nu*879`&wFM6>I)nN!FNDeoX1}y68<*^t*Y&CeHBXDf40r(8;T$8-{)qL2eqg&;^Dy5>{E5N9RbHMuQncU)SdS1zf(v~zGl{D}+E$F{D zKBxb-WNsxQCTU}f=ueMs4iCLhG zYD>VuPF;?5vV4;IJJo6z7DSlE8?DPn9~6=*@H>BP02S!+0RV{Vi0QqJ^lbbN$AP zMfrapNTFdJ;H)D-BsNx{3(tCVcUtK?M;me1A`iGUfyLz6I`7{^=sGfGd~OY6&Rl}& z8L?2>!sItx3nQSTLJsBjO+_sW&76SKmVz>1#>rXl7epx!h=OHJy;+R*q4G|#m0H{? zT#;Y%*#*wNtdLB0>#>WJf^bBVU@3b?T6#V%cbcP`q_>HrIHBfbqA^ZgHAdAm?`7{| zM3l7OT%B~{hQH&AHSg8j+S+oS$ML8fRgfoEL3(sr_0pM^rU_z9-!TXKWqzCTBKg}^ zw0`dzT(%-}OtS`r+Bzq=h4Y}7|r1}yTL3uD?D@SDjBWXduwo1_9;1DCd{y1Ys8^;vRyMj85Yk&4f#;pGXBV7RQ|>sai;GIDpWqM zZ}mojYn0Zn&4QjF=-MO{vi?DGmMAL#J@45suM4>%r3kELIk4JHhet={`?KQ;+uJAntFkV{ zxcU~Eqmy${)Z${YI@Bd4@apkTUaNIb6tE5hpT9Y`Bu7IHLbO`(vKdAWB&IxxNKGLu zb16VOym!kGdJ$SvufqA!gz15O;)EFri=bW8hED-?D1YYR(Ei2l2Ifm~Iu*trI68Op znVgVgGO;p}hI2C1iM3S)m++*xiREp$-SlyR;edCRP!=9S!LJ+_OeH8~b6wQLI)hu% z9K7jolqxG5uL`YO7$$Fh_T}elcd0pGhr<0SeRZp4bw$(yc|k#^A`q7XY7oz7zLwK* z)#FjcHeiX4@RI#yvxgg~&~o6Nu!o?aNK)nx(IJQICE|XfwJxB9q%{H%_;~Wo3P(uU z&%@g|D$sGSd#HYX`5od#SW;TY(ZGBJpL&(pSZrw(IJPtnIx_t#b+Tag`qmK?%ZWSO zzP8e;!c&Y2+n278$z#`?n--pVy>f7Dt%Ox=7gV1t(zYD*hnU(%bLCQh(Dd@v7xjG* zIJvy_e!wx5C;MH!%`jEnnUl(mxYaz1t0+>||GMYZ7gDhTs*U6c&*RbXoR?IFU&|wE zV&DDAnZ|tjx!gS>r96;xn2pMcod@~vS6+FgNlnT*PSO$N8rENA8jLU);lBBm2@J0z z_eYZdEj-A2?Psl-qegclh=LpDhdB|Mond<)4WfMxn>v2eo3u8U(;#t$ij1u_JqATH zcS#7+vs53?Ex^~)+@8cT&EWz8QnR86xwl4@HT2z;NcE7$Q=$eQ8{5|fd!I`rcA{me zJO(Szx4|!3eZpxps?uNS7W?VkgoAD%anGBM^USM{k}=EkBDVyAk=zelaFnFfv_AQ` zqB3fjQf2*p!sT;b8`jrW*`d~9EBc^u(~M*(UZ7q6mE%+)H-OV@q9aeL%5I_KC8*tkB^@gUEW z51{BJ&8`n2d~;v~jaUwe58(u7#XrAvh5#9SG`UL3?;VwMbBQ=I7eHT@BP*AKu!n-{ zo&Y(iWg^8UB?2%0<>EKFH^X&L%*5MRtzI_%hyYGy?GVE&{Y1K@UOGe}%2o8hR_tY- z87pNeQA{eSjxv136z@Ywo}4JEwUkh4+Z<{EiEC}8cYOK>*0W%;+DWv5?jHSnO03Tk zzSVSCxT|#zPseZQ-{_hyZoPk8Sh;_hVFk+H z_%;5g#c)#<)y-3;4Bl=FdTd(&fdeD&e z1YmzvdKum+)&3@!D5AjC0Eke(8L3x%g;IvBg_nFTWwQ9BP#$obeQ|1K#G^lfWdgEsDDm+vN#wqAyT5%*HsV<5cpgM*&m)#aR zZLaut-^q!G*`aGP|E0b=9yZL+nuu4jn}|YghmS$T6h{>sRHBAUu5VFn!mHU(l^c z!VR)KtfpgG$g>V3E7Tk<;sL*(ByQP~1Y=P|1mU#6K2Dkn z-VYC3s7H__O^R9fTK1M|PPLXqgi*d)#+`Wc!L8eE)#+O?rRf zvsKvow11vFHc~_h=yLqIRcZ~;G(~=Gxf8~36)x-K%>2E(WkV9m zIOrB4gu;)s)g_pmJkmmf_lA^18!8E|Fi}g3IB2=G7=_lo%!5|LCvvmU!dcUTiWaTy zFIj&R-(+RN(2uI`FvgyypFCY5+DJ*AN3>@+LWW}Q6kgQkIq9_YK;|eul3TQP=0Gy) zev)ZA7E7?s!j1JPscXptP;`oQ&kP+9Kit$|K3pvsCAaV-VI1SU$c7{pk>&X9UbNHe zau;(g)1H>bxwV#bFn_RRTV}9VlI85okdmF_NyY*-$n&+Xh(w$s2UMVB!eO&*)`{H3 zIXv?!aN6;RC?s;g+CJs;a>R4e{fMdh-XS%J-9^B0zMK^}z47Rma|Jn!JJ!Lg$Hz7y z$8Uu9Y3jc2)N@gX8RDWKe(Lw~CuE9o9@?rhd$r?3JkG(}h;SFhl6jZU4GwUq=TZ(Q z<@8+qKR+fb#2iK?^yAE^!Z1;Agw)Bw=oT4hGxYOPBsdU2HJUh@YKpZr;c8J@jV+m( z(Q$qL^SN7i>ztvP+;GKY<1#1}K?E8?2puvqhiPUJ;jK|=TJ%P0TEa?SHS2RN>aU{P zi-)_YkCJLR{Y4d7)}Uum6KJ zh;zkq^%;9>shGGMG^I{)Mu)~o5jobw^ry2=Nj=AMX{YD}nmi@y zbnZmHKx$}fAN_^?t%z>)37jr{-E%xi)ctT zDn2vUt(`=Aw%pFSRtTu;1|LOLEKRSH=LYFfU8~_6sFhI+mc0$iIo>T3m12PbN|6!; zkJ8HiP=)G0&O-u8K$&@;CvqlN1znEc(5jw3eMb98_|;q?Ou!UDT+nCC6AGwoxhz94Jm<+a$fRd7R%Wtz}NmU zf2_VRxo3`$d8yGzch*{>oNGC+)DYF)*^PZz5tF7E7OCwGpvZ9`6 z3J$s;P|HQL$r?WTGsm2ZA`vpO&ZwAV4Sjhn>6VZS`cnx`vI7gxLr@P=5Y-ZLF?Nvm zQY0||HE0mF71dKDsqE}ZE=QFnxHX+3gKKT5EC;Tcjdto=fTwZ^+yj89IT&bB?cocL z^xxaoD$i6bS^s?XdMY(U-NfqP`C8_(%U28!5;bX!hm*y3*nsSKb#nZpZWZl(;yX(X zKh5jj$n_IhY1*gx_4^i5xPHr6SsxZV!rRh99pG@t!)&Pp0Ldu3p47OKHY65=O=0S3SY4?o$RkuT)s>&Fg>pT`l;N?|TrTzG@QRkC+;m7Q<=}xKZVa;7VTcX3C-Tjbz_A zB!5HN57O@9=Q+?BmR`P^?i}tFng0YW=|O@2t%?)C*t-K{xc*+0fiz@wumI zc6d7Ka{BS*>oqdzyJk-`KGRok+_6yj9~;RYt9f 
z_|a=cTR(2dBo)d)m&iX5{58{f`bR|~oQCd5z7lbkK%m=YQ0xp1il)hjkF2Bkm*x;& z>|Beox5{YP1;4L_bXI5cB`_!Tw<;aC^PlR2)M@YadUai!!L1HupHNy0~PuJ=B9EvXUy zqXxAiVCJ^wc-9xbl=FAIEB9zSVtz{18ydi42y%`h3+Sk|9N{v1L2FTeiE&9K%acD) z`CDRBTJUthp0>BQ+`4x>hER+v(T$yxp%u--_nHX!nU@2;c*!h#SSohSVIVTrI zF~@H=69q=|j)v%QpqkEJx5UjM$C=-H*2#-iI`Fw?u7t3Z;}4bP#-RoMtzy6W-sbq@ z?Izz8P)!QJJepKa0<>W$G7Bb1qXK`YEoV)02pt*mgg^7e1C0eZyZ`^oJ+o?cB zaD-h8`L%T3>Jdt3=8`~=t-v`sV=|bvTyi8*kHlFNSIQ1HE3|7BsC#o9|JS7!=U@@8KAGQV>?>N#XHP$GDKCg5BC+9T4eSw}a_NOhT6$$_XZ zlI}$z3PVuf0k)om!lf6ZNYKYOuO=ETsDP#<-}T0fKR6gT`l)(LmC&NXbd&vBQx~l~ zb>iJ-X{`h>FRc1PX&UA1KpsypvG`)RWBilx=`N8t|m?d9L65NursvBGGpQf#6O;>}E+yEs=FS4i^t!${Re_Ef zie$B2C8~fPr29KHOoYmfXZ}!2Dq9U6AIWp*VyQdX8=p_7yR2whedtP=c6~JSl^azH zMG5RE;+i=k<#V6QU#4&SRKNSeXcbMpDA&zI{<7EDrVAC$Du4dFbx^?%TP_&e4^My}Uf<1Zd2$w--Jw zUNLcUI^>&^k;^vbF6U_(U`&17A{tbu*OX_Df{>n+qV$7r z{Qd{-I?R|$6;wmLsLD}P;BX8f4f1xW>Ztx(Gr^T>roBJt+H_r5O_IP^&9XlfO+!+* z>=4MnsC_&&iK`qQS^;~~5YBS+)2_&vj}hfoipYtK<*#V=E^$D0P0H!%{e<0evU2}l z<++QLz{t(TE+2YBFnLOu0VG>D8z{BoIkN|@V|A1#p-O={z$Rtj%eNyCGBKEz(9xYl z1-yNK$hfP+Nxup*K(sAG_K;VYG%>PkiB$6`SJ85}Sc8*C>(bHjAoTLc)XuG0>vw%a zz$=$SYRBnP z#m6~<+K#~yB$bcrTDQ;){2|QA${EcHv9;7rs9$(iyPDec5bJM?aFS-1=YC9a$GRru z^p`)n3MnnGl&@wzDjBdx(cgijgYi#tOUcwRi$uZFZj6qNiPN_CkBg%$IPzv)$i*sv zzZD$`a6nd24u-UmA~C88I!Jh#M2+1N8;tIQgRtZq!*f@v+_QjKYX-aB^L&q_g4l{y zF8y6O!MO{9sWmqoD#mwmF66gpVKA(l`I;d}POz668HY(eMnzYBaDQPfDFDC?u5kIy zlR6ezg+@Z~m0L|x6LD^Z<=U5Wq%TI1vpI&(_wdO8X!xh8~QPp5?4odF={bf^^EBrp>bz@M+oMDsaB-n*6B1hb< zu%=^grQWzIL)}bSV1r_}_}I&Oxd(9CH)4}{XgI+ktqr6qm2bNH-@ zVT+F`-W={S6~~oux4}F9;3N!zb2@eB?$A0;#7HJxUGY7NW9y^-hH*wDuxVSyZyeZZ z1Ffu9Tu4c2o_H4~o|9CqkQ^GNug(%USRP2vEmD&ZP$-2B;+~Zn%;7>CMLhg{lbEb# zbI`5e-+Eu-J4YtTLva}Gnrc`SNy>F$B@3FRVqTnefafu#et7ZAoZ`yR=5crtdsg=={+JEL0FY*cN}ExJy{&K#C| zsxFkd^>D(i@@U}cmSyye|B)RhHjMAk!MxGp{iFQ2A&hmUt4?RD<4>BN%Z}wI7PRknXOxn|`C%^>oKQ*bf7|bTOHJg<;M|6dB=h~)2tcRY%LwHPE&T4dLZ=%jIjYx3;Mr1tPE@O(^lRzYr#j$Wpf)hQXZ%(S=6rt zNOEtL4XG%QzBsG)R-zBIS^HUVeU-^@zl^LwX?~p(t0MT3aB$|nO%3A)56p#8Lbb`C zykWmZRn^lA2)0zNDS}9!5J)RcZq~)%^i2*$KOS@za^i^GkM*?LWYA#OLy2rPZwC z4A7e%eI|)R&Au>2_Nun%?BgC-5e!PbgQgCe9vyTnlh9eWe7IMVbMRDnq4;=`9GDIw zBMkmYo9VS%`9h^b_3&SK(+;|a$)u(e7p2UQ6g@cWCfyPEJ@Eqw#$q>;pp?zh9A%`} zasD*h6S@i!8}y)1Ouzru4@IPWG`pfNUAuAP^ZByn!e9``JE;9O|HUJuBjl_=K9%T> zSYv+omXeP&UiWbeA*_xn=0iTAzi%YIo}z@sueH0~Z2NRKSGg`@!%p8_Be{v3x6 z?WC`7)=;CDZ&n9g$45h9q}Iv=LH0LE zt;)QI*ZC3XqO+1ORZ#~0D(K1tN61RhR|Ap!pqW%lC=S}Rl_dJ6S&7gTVM(I<{{W#hSxVSd`mnc<%qE4OjW_b!q*22P8)Yu&xAvwZ26)N@iRi~XL z1=?G8Fv7;liBU8pY2$F40x~#B@=0Vs))UgHjFkAT-IP9%w=kt|i@QNwmG?dc2vzq3 zWV*ghRhft&{_&@4;c(Kt23fyDvBEr=O>TyjD6HT@Nz6eeqM6O&qpsRh%uK*h%m7Vn zC?8UCuY4I+GkQtw>_d?IxyDIEFyCxsiHVXRy*?9QM1{3{*>_!0*ey=-;yyJ6Yp4Bc zR;he`sVX@jS(Yt?jmltc=cEb~Uwcl&g1T&HVa`OeD0sofK{DUv8IG;lA6gI7`dW9{_k#UGY?-j!0M#Is<&uRIBf~uKpx0Bi+A4F>r z%vu}1FFaOk*+i3e-Wn67Vus$$PQamZo&bM508v8I`{lQBvAtV|FMZE!KFis_?Pm&0+rv_iBDm^C?JC^EnztVqt`N+qnRrw_fmIG2i9lEp^p#rE6cFkkw1} zlr~h@)N&+lAV1JFlV&C*kMRB0l!TjXd#gyamU}ZotjG*AT?!uL5c^@EbIjJy^GH^E z+m+wWJZe)9heUPrduKyCN;uNUI+fC(HDCVi<+pQLsQuXaNmU9jV^8OeoHpPLEL7oJ z@Qwr*9vN?_eHZ1)f09#z?Uo(-z(qQetMC??Kb1%fyL0LOkUW7n%lTNb6v($e zXgNW%8>)p=@=?V#ogSx=4jjKxLVlR!R4d7)ETdWJEkt|EiU6r}lr}6b?w?7qwu{Nm zbD_AbX))pJU(MA}3?-VX-Mj4>8#+%DZH50%T{KAZZsb#7weNuMZ*@OWmI{*091>a3t~Iy5S{8tN9$TQgK8lw@FDQ`Kw&-Ip{`fDu$9A_QVe_zo}&#LaIfnRTkh~~7bGkHP{*B-E;9Xkpc*V$* zsJ^(>Kl5A1{~rFPUTZLg0D#1E0WrEr{#9( zo{~qHg8=rb@*iLVRFBEhVB zRFnMGs*+?rH)YpOW>hB4jq1(yJ`XDEFSOg1@oDiU(sZv$h~#|Gq0`>`k&6eb8kj}z%uZu zaL1MXLl$D5|I%aKqBVy_@wcE!nI6!k^JQ4H+of<|Z*HQJ>#IbiTVCE21w~Vis)REB 
z7P7(hb+dUgKXONB0!1b>r{z1y?K^2Q(8Eim`P{ZZT%5{Np#WT&&!Hc-P|~hl?`odc zA{y&Y?8+tSCod;0$w6RfGpJ9LtJ>${`EF1-R(~1%?nhpglnG5>yF&aB&}LEa_@jt( zDt9fHM8Ru)9WZo$KuT6wBtl0>GI=Ng3}i{>oLDq52ECH<7g5T8uv=rSj_U1#t(!rv zGMJv0>!mmbeQvDL!Rt2sN4RHrZso4L_?2dafX15EZ$kLBghHgrsQ5U1u;eczX~)UK zBn{hc7QvShJRz7?QSmj2+Wy>V6>!DX$*9 zaQ1OEq6cLOb1HbJbSYFDSApK6Gy=(Rd7=*@1Vl8XuEZ??wk8LO<|wLC%uvUzC6X~% zmZTaPd07f46lYZdb=3JHP{KP~!z3Gf866f(O^u8za4jwFjsfmHs7x>$4I*^rp}+N12uX{U z#R~xhCiRJH58S-*V(#3XejVe<(fHr}yucz?I%Lc=#{w6k0%2I;dI!w$@&`H6YWGxc zKOMiVm#%`|fh6l5;KHtgAPTlvHC!o%4$InUrllJl<Ls&A04B6urNav2F#`3Wnr8BB4`Y7I+$r@3p;?;wd>xx8F0 z?!QQxr9olIMk`>af<>G>> z;r+z~K$@E)77S>=0I0|xpwc$o=?(0j_{VdvW(`TJ>DLcIt*N6{I*Gk_NGa% zgtYY#Ri8CPBJ66fS)8RDs(8AyZC;q^n`FWcX5dYTW-Qx*Mr;wTT_-}RZTJqndK|# zXr$@OO3A?~7Uq549iHYsV7b#GBmWGT-+#u@tuLw$p?>c4cgx>5HIUbctclXY4(OYZ z`*qkIzwKF$)N>@~fI_7|%M~XYNTPL}t3yi_&UpM^y(WdkCccirw%L-XeYcpnRU*r+ zn>Wf=LRkd83cBayEXBDH)+CGt%okFsO(&!$ku~hL&T4;X05}X+pC6w*N&8q)EbkmD z>vP&C>a_!ZrstualK^OHu64rj5)f1^Dp~IK%4AmGLzObld7yQp7?#?G2QXaF8dr%^ z5%WY1%`KG@XzlFszqG*h7A~R4*m5y^DI!a^@-&6(3s1}<*;*Pz&{9P-wKKylnttX) zT-WE=o~WgVdOhbb)wTWgrJJdfs@a=w>xcP)&@n(1D0ZC$!`AtRY-25oQdnE<0c?{!jTdlWHU2{I$ z>~xM#vy%oFnCc&u`wzoPeIN7&d*yPSWS#a$FXd$)U*ovXWY;ARu!_Jc|*c8BN(o&f(Nu1xghw*rQmG-+jDIJH{G= zn55pJQd*i9lJulneByX_l7jwDfrszv@5|d0pS?`3jVxkD5oTXIP$~heBVbed(oDAm zQE#BKZb>Za>WX%w1kK~zpj(JS2HbD{GfBfpE<^mmG`$7`bnUj(sC`}ewSp9e# z=q%*dF1O4rwZ8$1&c;o=RKcvkPvUOJJkG7$8?{h- z_%N7F{^Ojj)}^Z3`x7Mber6)U;htaycYiq1E=mPslbIPUGm)9{^;B$E3MRWGgY_%5nA zlPxdK8wNYDc|aHVicGtBV%}91NLN;uzGw$Q*%Ei60>J9*%39LAxF`M;FKoAV3}0~h zj(a5CweBu6uuU!0;SZ4s48OFYo{|+Uf+5myl|U+jhUY5X6ID^2+`Y|igaA|%063>+ z@buB6q)!f|V(_b6`8;35AEBay_;uEQo&hPeMVW+_%xJO3p((%IN4IqlS869%@*DUMX zd7{^@UDNZdD$0{_Sok?418%M6iitOKL$wVcbbNMrZX+veJ~$HI!%8^gP~jJ~@P3%< zKHZPn1`;|c&(csOXG_SqkfJs+ln7#fubalt>hcbjR&$@H(Yt{G=Tg_~-)7Bsn>um- zpfneiyoJB!+X?xRN0o~ozvX;o;5rjP1|%kDWsbbuAa00SYm%S)oB=%JH%cQGA&2+_ zk^VhRcz0Ovs_0P}`kvbC>96B|sjET!h}La6mq*uK^OZ%+^vhKOpTnzaroI3LhX3sU z{c(A$18$mx!cJ1uqiGxqly%j}96GWzg%JfnTGACgg0)5%Vf4Uy^r{|bND3b6`0zFf zpRaGKov_$o?mdhK1GDy!41G~c=eE{HxwSlcv=HAMdLg?0985kZI*mGo`=n?DfRtU2 zG;5lit)hRL9qgi===9LV3ts-5H}L}-cN`$s&451hncs*?@xYdh-|srN8oy5XEb*;+ zSa!uXQm8?#5c`9m#z`gD(7#iGSq*mY{=z~|JVXDb%PN?ssSM7`FTOu06&5Q=XQ6i~ zq*CIO3ibyLu)ElD2&p|!8)|JoR}+b$EtFov2zo)ccb+;7AqwXKHv(Simrl1JIbH1J z486%y+bTx^XBJnOH&_5a2@PQ}YPj}GnU%$)U_et3H)+I>M@me~qk_ZH&Rl-~@-_XW zT6Rg&`xVsSb*iqz0+PBnTjN2MZlwVRJ=V25ko%Df4=kW3S-UwTK`l2NDlzru`*DA} zswUqdo39xL;}0;9M1oIWNis)jwZF>OP=1HA{gvj zu~sROe1}T7Ii5X+GqDC5Gbs+$l`meM@46>>J$pA#Ou`XZ&HQXSfPU>hkLedg*Tz zC(+LF4h0HxshyBkf_GFn1O33qNd(GVo_0>{6zFTDq2}+crASnoOt51}MxLcSkyD?@ z9NA)dmbc-Ho+3-s5Xb%c3xSA;L$eh!cuufOqn6wk%kOg7^-Ylzu;d>cl%@_E>FJ-l z*@iA_nam2!Yk>|@zMID`w~)ebx$>z9sqs?0I>XvFd^Z5swQVB=Xg<2r;tv!&0J~cr zlg{3HQrs;e!nd+1bE!NG2c95)~rK53*`MtVV0Xf_hI|2E<=XZ>A7{TVcadnkiJZ zw?aE&2O2LIptULX92m}}ay}yAZM|r<3XY|iB)pqcPaP}4c=O((YM35(8gIoMP%5~q zsIkyju4!@$1{I9o9REZ!p6OS~JRl4WG>+=1Wf74qIkV7j6+Te9j!shJ7twREkSXoj zy9Py4akvN@qhc~i#`o`PD);UsA>hQ;`-pRguGV^a{{9>PlB**bE-rm4%&ccW8XF8Y zeXZ8Xz|4KoX=NEX9vi>q`?2%8ALZK?$8Y716OoAo{iKyO)B^Jem}OBnEfL^Tkq!K7 z*NZ^`tZ@GXiA`d8vqtPhGF?!qS1+B{1v3fYB`+myE!;$ZfI_CBb%v}lHyZ&b)v^|X zG^I1EcX=VW3DjBVN`-m729WZod^t_cik+251vH zN>`jtS0UP4=#qX_UqOQ?0}PktNlpC=pK9&N06qLXj~fG7*`3nt^xXkv)pG+U`KY*v zsP2+?37xB0ZC3?Hv(dxn(?bH^3H`W`a|C z%OU*FzmimeX@CgLk?wSb{|emDwU;aBvj2E{P@S$=G$x~+3`AjZ$b|Bb#EE9j%$UX^ zST>q8STM_QK8$S9sQ%Ov*X&O!^>hVt@|Q@~5Rq=+{ZWO^Bv_EM`zE77|5QtKe*zuJ z@ouROoujV3njjP-guebHeloe&;`F=^T)cO*iMdl9J5H|uNm9dp7|X+rDgw|~PFjVh z!Ip%UF$=RQN?9D7Sin6bE|SNKTSV7SN4Z#)EL6jDc$QnOGIIE-| 
zq&$n_=L8hNA2d)qi!N6tVclH8e*%2*&%_*_j^9k;!UqZ}Bmc2KMEF?w+#SD_r@A8S zUH*NmzfA^0UN4ozQFam(mOH8>N<1L>FZG_mLdmc0;vyWCZblVOY@U1p3G2r}@Q?EP z5SDkDyKrsHQH!lK1ANFi9yE!{ke@#s?iCF9VEj>z`xT4`pZOJRj&en6wrJm)dk%4j zTcx2T;f_>aN2fA8d(zv^<<3NXGbC-vy}QkpT$havEG+5s_PrU=^1Y!^sk{AcTaed3 zD^h#u`Fx1>qxl=*c>U(mVD`IqZqMc_*@)!1REkDMrMM3`7Icv#Oto|nBTFKOwtH5| z9_xK-9$iWHR=FDdh`CRHIs#FwYdyq)qj?hwHb48PtR{pg)q9}cNv}aX0tM!B2EhAC zaSAEIf2^p`4J?f$7efZD`P`OP# z%3gHKpP-_|vS;)ZMejL$t%w+!gLL!wqjN8)dL3Y!m#=?1iFpo3PHyHj_ikd4;Mt7k zp78ra4>Uqe(hjVC-LbjP-pr}e?;!coS}jW<&YTStqc1TpP3jzVDc^i+{BiMzNyfdU zKI~Fkp*wKHZOvdFSr7F*(sP;98C9bp>)a1a)N&Uum!?a85(HH(aU$7kJdrn%WaoE|E??lI}CDL20S9{*Uw$7~u>+EFDr=Q6W zP=zeJ*Hk~M-9Lp0S{@k6uOY4Z4PvlPa>73O+oYcP11t<3B8OAuBKFioHotQ5KU$5r zLyTta$fAI$`o+FBjROl1hNe6ltIqUl1gJCcZPsZ|*C`^XSRl@x zR)wMab>j|}S8`ysM6{ z?eL6o;WZDcZMXVaOQVxK8E$hwC8MESW{F%_Z|T+W zdGivKoiY^CBVqR&r-1%a$^->IbFF>qnV6otTLpsx6jl*2xKMd_5GbuT7#WL_pVouE z!F_hlM9x>3H;Am`{gOrir5MZB( zjC^FOZ^tr*e#!4I&wn{by@kBb$4;A@ePx6{pk>Jj|EO!yM?&Pc)DkSolADY^nEw#u z^5PeH3NT;hpsf+->r~W^`~F)ECsC)La^i`jj*cxGv1fBWS;iN+V-yr*>@(QwI;Ut= zIElIeL|3t#auRGG%2^&Ah=``qqfn|qOWWraplN!S?o}j+tfZ`~WJ=v1c6NR#Uu4iw zi1?%Kj<(z4Qtf;IRpq=_pA?YuWMFN8LN)oPl~IXJc(Xb$<+v)+jY6jF7=JuzUHSF$ z%O3)osAq|asjOQq7N=6QFNMQZDn5=DYIAB!au>im2JaTtN5l7f)d zRJz3vtQko%J;VW<)89-)rOjK!!e!0?WFXSQ0?+a59**MVAp2~OS zR)RJ?F5$|rDzz#iS&;!{zvncr-gSh5wi~Zyat^M(3vcC&W_emG{j=FTM<5fhrW%-` z#CbKuOV?i2gD)4TWSV+;tuQ$d%aR|=q-b=7V`66dHoaVPK@sNH(L%6AUnJ5U9h5(j zl?>Ebi^6A`B}(dP=3RBy?F_O+LL0N8frb{nPv(@PZ|z|Pah8NavFBqKJLv zuB{RLCy$Q&)D{JQIM`ESU-5MT*x_rM;iYql`7%sDw&kC%Wvez{W!U9-4(N`&95VhL|{c^;2S{68h%roHqjXg}Oq^x#sH|n^Zk3aBxT>&SadLjR( zr@HGl^x$!(8qAH7XNM|1QBz(fTAwR=h;diuKaU?PwB5Y;0{o=%E-6&dy7-hQ2)ko#CC4UoznQ<1K2#gmsU~XcO|%iz-y2p*Mr9JY>sF@6qC5M<@6LXu zojB!LL@KOvqF7)2`119_uM5h> zi{5jqr4*{(EiljD=Ht^iJy=g}jZAU#ggfP+L&vuQ@}-xr=QBv&ER{o;kdC$6|4`& zf6z0Jc=|b)co15^z|0Fs~URO3XRv4U=2@Pz5pa z;v<~7Oxi+v{phtK03b5kwU->6z78OzF6lnjXzbg?8~mS3qKS9nXil!PU%uVNRej<- z(tICWm;+3z1(R*VCHhzEHMrnP)}i&2Qk7FC|1p3<{nkxk1HXCx73kG=GaL@w_Fx*< z%ND@Y-0I#@g(waDvMO#AvZJm{gB9uh<2Uze;b=uUJk{0Pigt}dAX&ki3aY#wTI)h$kKu)z%VOo5!5*hw!rbm9H1E1o{%zB(valXLLS}aiJT44$&djwQ%?8N z$-c9VmZjC?)=rTnsx4na##-Rz=bZ_O97Y9nj!xk5S}R0e2aQ9xx9fZQ%#Hl|)?UL5 zhJ~T?mFF$sm>BckMy^*e($(cWc^EVIN1IH;gSwMy-G?e{&Ew+VFaA!mkQu9pVcC-^ zHOV8*TTFDC>`ZO-7C5X>lT{NzJpVid5ae5nnCc@}y?5#%CT~HK#LS@F+W?4Lts2;Z zy_-^!TjRDl@WJ{PBcT^+jy`0{^68zU<7#LS=2&>=NVe5+Kv%i5VAX@Sj6V*@5$(6N ztMQP@iET4Nj3bFMol~>|%&*_+_J$Q|!JnhzD0PM9N-_DX9m{2PeNodB1c<5ny|c2< zT%We~MvjqYA24V_3xmD74( zgeX!ZxQu8cdvZt$pSsaMw_c!1skubLy5qM^JBI*Zb4TDqQA2)x;OU~n?iD(lDGiFn z5ds4}Cvo|i8~X37&n7WrP}MB4@mi2%He8+q>SAgwGJW!mw}h|C=|i- zHU5#&9}l++Q->I`l!*(V4Rp@_`$^v>OBX!Pw`}ggkMw4INQHzW{xS*3V^rwcb)QTs zvK{wZ-uz;BXIN|#EwRDpQ(@e7d)l5cqSU-c@Z=s(IVnVQ&?Qm+p?((@on@BAiwkLD zIq8akFVtKliOuwH5tuVsL&NG>tv~W>XC@cN{pmYk`^UD0)JUYUC)l{0*1A)m4s&PG zONU2PwA4UsNYB|i&wf$=Bn;bXsGq1upVByzKxdC@%1e*AwnbY|yhY|oRvh5pxXd)z z0&v-q)9kiT{}sdo&1>feB_xw_6kgT|LA0hyxN~XBv-f5+oa!8!7jyb5mZGQWZs*Nn zKNe=&@ihd&Mp3?;_<9>Xi^Cvbyp8iGd z$?W640aVwY`I=8K8W*pd{pM7Zu0!xbMe4JC9 zgzUOr8zpyhSj5(T)suqw@#MDJ6!ZWJkd#WzR!*YxW-X%dkmV6zif6BYmD$ePZ0_T6 z|FqcN6slpXG=d;+RQ%7Euj@RmgNshqzOp=<6x6sIAysPyEackPbY9QCVj}ErSEnQC z$2Kzb)oE-S&ky*THl9M!xl~e}S|k?913XmY(B$L0daiq9%Dnuq60W(0cZS6!qaAq7 zyF==84(ifbn$`4ejduAr7r)B&D~z73aUDs(jmO8MH?58gdD&Puew$@2lVjwUqsbG@ z&0Sp}SR29QKfP~-Ow~Uwzo+-2Rgl$o`8}gB=q); zp>Md0B{=m?tg3h?EDBJ&x1-^$(X835vdM8F7NGc*KhcV50kDxy8`sb$DNJg0enj$8 z|0pYe>1mJ;CBc@q?Z#Z#U zpA3~Mr_t3gB>4eq586uv#C&8q#y%QUk_PT%-1h42orBWsq$$5^Kv}gDGt2o$Ve>00 zeGjaUF_~sV#N#f0sn^5xDwv2$Zm;=zv3pR+CDYai%@eT^t!M-aO;VX@7~rYVQK?EB 
zj;>0cevZlsBcwER;|hh?L-a8CI8Q||zC@1)*cp7db_&C$irkj_;w?_!yz5IkH#y2S z!!&Iab_xAzRvT;nrD9y#%`H^2GGyjTEMG)oKf9W(*Hl2CSy!Sb4pE-H)VZ)falt-* zL+e751x%?&9kTa;GfH|-uM%M-Cbd|k86CCt)AUV@p|J0zXywqH`vtLJA#r1bh9j254K6X}& z&S|3-$p^%E4WA!ggc5k-u_s0S8#i9?0b?*l1BI|_vTZ9aT)a>5ujHXaP;a%4jN3-% zo=7z|G=0Vtgs`;^Z@qPZO_u?_X0+nSNAx zcTgNYh;A8}IPUGq*wQ;_kWST3CKD8%Cc)(!B8Qw>vI6i3wfl2FXQAYSKH~qeF2H6d zE+6-o#0*d`Zvq1><@@Fl)5y48x#8tvFk8fjahBMrn9e>-O$C6ui31>jTGv0hj=E9y!t&n{)?P zi6qf=H_w_VM`bC_kYU8>WNSW^8Ek#%vdu&*YNO-stcxMq6Iq&h z(h@r4kzxkImg-v+tz+q`oV@cF6&~H|ciweOR(W{5(FgNMIrz z?m0lTCNhp>W*VBq6xYBH5Mep1Yj76g>n*@F=_@QyTRj|DVJhHqIeizAA~2@;hi-J{ zWYVbcZa;-GiH8zyHq#S8FYWc9SCL8;6#eY#vnj3@9naQfYEJ`qz!b++4TAaCsOyMW z7NJ1zkI-33AaL>jo*GugL3&RFgo zp|C^-syj_>pTAW)D|wsb*hZPTlPQ28io}1U>t9g>{pz>=DD6}0n5b>SAwEuW%YuwRQRg%5l%FtIKNH$S75B<^YpSQg1?`Q$J{H3TF;J=?b| ziSA%-SC7qrUi}HnSh2>^$QnK+o$G}c@P3|;I_dH9Sa%qv-@D3=*U%tlb$g}fwJ{G% z3Pm%a6K$Pb2WWS9SN!nAje9<~C9F;pk(_$9&Kvz&^0-K&`BbM=gtQL*@g^Po!^mWe z2p+c{JAEr4Pp}K+fsi3B`f%9m0tG1btjG1o5jgw&OWqx| z<@#w=f!*Np0&Rge_EOZ1&obb7*dGz1O5H*^8J?ShoO&=ry?kB`61|Ggn?)2D2*6DU z!_GmtAXp(~p^Q_?ycniHD-++lK;D85U5>F87!89m(^CL@5dL6c@czD&@hXr0*MH%gNSuS4l%i3< z7#}AJVGk>Pvc^M|msjE_hyo3|iLQ;D zA)&@mWE1C!=!-Rvh&r1>i6}`UO0HNyFT9?$)r~KB$x90J>+5v9Omk+L%r!$MOI||# z+x*#EM9AWV%;LD$Fdq?$A*N>-2HGB#ld2ErzO{2574ZLx95b@gIl zl2JST))HC_Kw!2YRe0eyGFEsaLO(fO-A4#+_=SPT!M2SvxOeaAG*CdhapC+S`ZY_3 zoaUQx+y*mEqzU15)(n0&;Dye|%pqVz%Z2Y=}+qOX#T z{uHW57QP;)NvlX_Z#`?J^53X=MYzf|wYkmBHLv|2CE^rXYP#R%p_}ceq6Ha zztmfV>S-37De<_(UJ~^m23UpcH@DP#BX^jd)Ctw*1=%(mEqYAgFH&lM{6xZgPbgU; zoUgTkV#OLU$rea)^-}{x5!A00U%Yx(3iH|9`u|5~5?lSljKepXx~<<2zO41r#cP-i zpx3Amt;iJM@0`0nq6A>vCz90Pl2zZhl3i%bbxT6fTQ=-a+~$ND8;7G66Jb z(@Y=LR60Ds;NjxP*(MC7UkyPY#|d(YD8jX*B6!EVWaTjQnWd%0Z9i&xpJZ6t@5&TL z1Vy`N4z_jz*H*$WM$FGVt@${r?i2G=+3<0FbGPF2p7_|u^oyroih(syIqJJi3kJXv ze>d+f%S7&nwt#E$LQ|b4O(WMlS!^`%%E2KPYhKZvt1!74$CJvM&S8V}g$7vMAhOMR z^I=?LJ686Ne24-lTI*3SaKCI@B3Iv`d-7odIEyxA+`UQWI2ECNvB#gisLvy4OJl3x ze<-BKHhWQ!KorFro-YF)ycr~Jb={OHr_+JSykNYx&Xi(46}y^#O8N97y{R^eC~G~d zJ6wu(n*Me-ZfcS7k=XO^#gfmW%wK z_x*@n{Ot)dARPtof_YK^jqRI%1^E~AJwL02=Dd=n=T#f0Pp-~4c1WiAOI_k_-Y(*m zkd&S?sH8m%e&fI4-97Xy{W& zxl%ANYOYzwOyeB_`7?>>G7GA?0u-9SCf`kVIy}P9EB67#!RdN=S8vqbQ!CupaBJv& zdCF|kZ1vZQ(Po)F#WBtz5SVUg^qmZd-yFso|NQai#ZCYD+Z#{mzo?oJ5iF^o(#XzD@9Q;pFR-=r z1BW!oxTAsz0RyLnfR1rB4fA;|ah54zRy%h4B+qI6A}Zk|62`p%f2c&=oYkjs82Kdq z@%C!do53E4wEs$ z);rH?*RA4UDU?vWyT%clE^aSfs zSLbiBpQvE9+CcB4iaF~{>P&w+8qaoQ-knhpYhSy#{^s@fD$IL00R8Ugl@-e9NL5ZW zJwi9%GM&M0W-3&;4OMcOE(@zok#(W*MQ=%--SsP8xbjLGAS?EdzKQ<*HYgM>W13_D z0#S|m01A;x^U8(Id0j2SG=UvMYG#e_>MbwqjDUivzu#)ppDu3N^k>6aT{DgF(so0# zF+I;KZzMVY1+q(y6QQFMBrwl*dN zwdE&TQ5hInT_Ir1@*E^$4!#`UB^%?fEFHBnw1<|4=JCvbMPTX!w>IXbNlo0j*8cJS zaM$V?(M}41a7@K~E>GXBCAFPNJGE{gd-Ifc^!d1ic(U3pPIhEhy>k>-?Qc86H0`w# z^k1ifi`ACP(ME*cJ-{J0Jf7u0gs>zBW4CNyRKdliVB9UZ>FY>`G2A0@gGw%b#X^!R zfI^sfTL4jmd8_)M4y`0g@<_mAcMkKCLK2X#6%w&jv%b%i-d)ol$Dy^+fSInP;~PLu zlbCW3HFfpjz$f{Z#b;NWJ5Ar5h^SFo96@73s$^QYI0Ms^Ke9Du|nz#5hVOEr^Xj5$cu;n_&?$pRB`ef#HOe!N9{28P!p) zWZ1XCdF?V>9c>-99jv0>>|PT=2@oT?1rxRDNoH3qGbWoshnElD&?m^3SB+3GOIqag z{y3xX%_Z|uap&2T-y#pZ`h3km_KK}P_qosc{McG6ym&UFE`h-ktu|FJ^|}#EtldSj zzFP0^-z{}`djReHx)T6Ebqw2}cUe=cq;7_BNr<;Zw)eRI;0=EXY4FwSS1c%PmZRng z_u-GKgUS286zhT`5qSis5-7!kGF0sn5SA6Asxg?(;r*^)$mPIn#p(p z#7n{#CoR+BIAb3wD0Ka*_9VwHLsJ_54NKPE`Ut*AjMO;~SO2VlsCP)|ehKpI>@(Z2 z7oiqbzuXG6nT++k)r55ejDPgtRqc#^$h;HB+Uuhd4?DNtF=MO^NiGrkNnbffsOB{< z-9i92`tyr-d-Qw4Z<&N*~0 z)6+slDAM=df8*(F5&)Xen-S!QsGyJsu%KSPioQ$|0a!|rcQTq(krxAuEj~*Vinw=> zyL!fl>0M5ab7T}Riq3X^c!NkskoAuH=z%KKiB5mQAG{UHX~wlh2n!}que%%w#7xs^ 
zYr4`V5`Y#$HKk$XsN^wD7?7hU%QHnDWUVWa3p#oaV6CZtE$IRK24EAa;Hq^E)r5$yS1zG}{Bq1OuC>4WI zKU?u%t#0YurzD1N@yak+^YkJoyWs31_dBF2O}X}vLh9Hb-3XE%LSMdo{aw>TLQJIR z=32k3*ZG)>U0D@+mMkgZne;47i)8F8HPd|0FQau!#qmTsx?beOF2DrOZu#Jz!x8oi z*Gi~_k%zPN8M!u$qYP1FK9Zk6ONp$F=8l+Q3k#q`XQ>tmQx4MXXLJP|1Ol%1GHejM z@r#=n%5gf6eMax+-f^8x8?v#d0NB##Qyrc+fi-zvX0x-2Zoga5F4H%`6ct$4cf;C$6k4P z^NadipSsoJ7(D#`{&yr^tVh@{orOvi z<1V{5?KW6LjqtW2UgYF+99G>!{pEO)btu~h1d&C9p>V0fhAK08`+o3;mCd2q-ila z{)?s+eOmi5IIa(zO8;zjBiv3FD4li~_9mvemw3l^KEGw0VVY|{VameLUJaPkw7E(e z4mgG%w4->wxmsE5QU4?^L^z{E(iLslsw`@1GJ2@zKG%OX1oK~&<>VejMh zW|hh!L(QjI-Q{8ncX4TevE)2OWx^w3t39=0@qv#fNzBVK!Ve1LtJo`@qa#LNc}f~d zr&{E3$jFjYB)3F7{t=suUac82k8pW!n_&DXY~aU56}G89IW zmhfGqGOwd6IIITeM>hoG7I6uQiezyaEgn_pWeL(a+CvSV?#E5;?&faiS{eKatLf{t z1mRDge%@y*W6Dco32FtL?hEN*eP%u4B@qdGg1+N7eAA@OF=qV!2&M}KBu*lcp2dw} zf@K_~#4GfPCL{IyQdpCj8`rPj{ zESz&^KAS42`T<0YU-uj|^XpU%OZMM%itGq3a8{ZrJ&bVCYY#!n_)^_sB5Vwo*uqD} zYQC6<0kMVr=ObWG`6jRqDQ!a-M&1`Io*a;Fz-1}EW2w}2V8CR^T`$FK_8RwTEJf;D zB%J<4MRg`22;CW&6c03@QgaJ>@L4bNdo(-qqPAok`FS$SX2oypxpaRBBHvoCp}FY? zfU0sMSOF>3X{^rdbSZk#ML1ZK6&;_~h(o8-egz85Q|_dNV8}2oZ5(`YfzLTSmzTAa zX^icxyBsr*!wVnN_BV|eu$DOU{=+%F^#6(U71_Kyle?d))< z8dh@$XE(Ew7U9p#(dgr0to-NOH=atp9#CAeTSe28MrsAnuDM8hGjGFUH*7&)%^h`; zb$VZs7d`0~zF3AyPyy#*3I-n~rnoVg{&NpFbfV4sKwmC^zwwG)&9qxQqhZtBFggcwT6UvIDono3&^_joDKoU}M1n-m&grZ ze^RM4^R1OE)tx-bMh+6Us=iJ6O2(#k{p_Mw=pI<6w>2+99?-d<g{NNQWS>0~E z7?lr-O+bSbWaGG-O!)=8io7@-ihxo@KG(|nAII+mi+jy9BPq>6>u97ut2V*e=-8## ztx;z$q|q-eH;|cU31?xvnHn~Hw;HIoMx>ovS>6B(Q+l83x72NAQ|tjyH~_ic844_v z63R<@#s@dQ96bkoX~C+y^|Dri4JwpgpyS})$3B$|gP`WPYYisD0?d2N-t@j91;I(L zYgvtb@hP;DTPK2)ra~X%ERHkt$6m6w16He;bW$eZ9lTzl1AznReU0cAW0LLA1n|Dj z5?)SUYYkM3px|E8Ttw9+03ZG!V^sa_!|&<1@7;UxrRV)Q#zAvrejm6l_50S!@quJ& z>m?W1Nx>z@pupmZAEYBojaOuuC_IxFujSoF0b@^A9fa!C)6sDrb>kPW01hbSI+wk~ zX^an*((opw%7d5alnqOJiHh+O+3`at8-c!B(}P#Dh-bYt4xj$GHV1$B59bj&QxS4L zhrr<(-m$*AR;yhbT}&}juG%e*Qzs}dW1zyOLF-32D*U~06JQxVq<fpl78ASl&EPKJ|7d$Yi!J5p{Bx{9TfTc5 zp;K-!oKwODiFzq(wK9-AIR}1KKf*R7@#VXf)dEI`~q}S&v}oEF~p{p7B*G_ zD)jUah=c5RcK!Edt}cxaJNy4Hdw1d|M|s%!|Hm_9vv8v3Xou=(eXqh*#Z+$kO&QSHhH>x zx~B&Xp*Ne&ve|6*_j$fg^^pDj4Vl;LT{YEDKUMXdbyss78mcUpFm>0&6*qilu$Oc- zXRvNub$a;P&y*c3Pr{HBH;3~9<%+%M2A)|6o$WjI{Aa-n0pf3Mki;u^0=Qgfe%*O+ z|801K6Tirml`{za(_p{ldOd0x)}LOH{oC>@?7_x&TQkpa@qdsJ*K)RUa2jM>d;Z~M zFsDN(qQNyw-at`Sn~zN$x%Il0gNt(4u&w0WfI4?(8Or-TykU2+$>Aq22lupGb?xB# zTX;8JEP1*|p1v#_)Wz(!ULT)o2>l_~P0N5H3tP1^5uf@br_EH)um8ltwIuGPV@EA~ zdNpUD23I?|a=;az!CA`k_i)uzcsX3i9rEGUm4n@Z554{-gIiYUTUT9K3+du$O*eqN zBMEWXioE`FAYuDK|1h{3!%b6qtbkUee&-f;qQc`Jakq~19ii&J<*mP^)epbD84CR3 zYr<@=tccIf zg`Q96)WYRb4z0_=;9e76G!<$}$nN)FdGWgjy9c+i)*Mg_`U9Q`9PIUSX7Ps4(Y=*< z$^Fed(qHzht9U$|z0-wqJmvS;0^BgQx14=>#H@B0emh;=At%sRhor0ycUtQc z-f1zYl~d~m`*ys{e4)+^_9JL`bO~ifUCPPo@*D5IWw3Z|31t^Ew5pqK8(ibOL>E7< zdiNC{D634JA7%OE!mU4a2|T@7u0D5akyl~1+{8yiEMe zJKtC)@00P(RN)m!G_W%HAL4vrIrDHfoBOsxy?Y;}tnB8^IUgS+ic4x-kO{%u`nd1} z2Tyk0rc+K!-k6l-e?$0DmGUtJy0!44?f>T&F0SBg`s<8aqvJtiCMUlju$@-&I9UHp>`3>NC(A2tHdZwUFtCT37k zE?YIIzPIpmAcHf6oIVW+4(~c0>=@s~ErGYJEIYlhanw5#VigQ~`emh^dWb92%zkj_ zql0q3yYQPXHIAJZesJU}e$pg7bsgVyal^GA8|=?>&5~21uit{p(^TQeFRT>%N7oMC zinws3I%rzu1?oZSE?)Q{gX?b`G=nR7{P4Z+FISg4+;|c+zMXmXwVxXNyb5>iQ>(*U zGPLD->lZF6S1jJ}&m9TMDUM&cC27!hx!3K+!Ntw1=r1`<7czU*O}7me&6P;*F%Mg* z_g__3ns;$ea*-`4BiN$^AhE}b)sFlO(M!pik}nEc8#?f+R2%8}qP29t*aV92Y0DBT2Eq9AJ3gxxd z6jwOYzfh0(LK%N>!^$9CtGL1%(#qA3>y#zD3v{JExh-6BEweQ~2eNPoLZ&UesWIHd z@cM6~TzlJUu91Wpl8+zxNLe2)z2xeUBb-kT4=R`Q;ej|C|6i3+BC?l^~ z*v+Ro4~m~%WUg_R^Praww>s$^K`ih(YQ*5QXV{T?em#5dy2&#%bn$?Nz`C*aCcE1VeO 
z2eEh@oZ=`~RX>=?ZxNM~ynQn~12Wj{S+;VeUc?l(6${;g4*j@!DCFYR;Z}}8afByH zLl+jxzIZZT0}HR72-BbBs5C^lbVazgrmO+H1Mg#MS5zGCv!kB~m2X8|)trpG=C&|> zde`9X_iWApjV9-Cm>$bte8?=7T6NcG*{tJUAlHYsVU>kk4 zj=Kh}lv6$2HFmw;56krw?OD|LIVQcd^4;$#YYjI(4yp>*YQu9BVeYpr{KC-{;r+Nb zmi7MTaP%5>UP!906vW+8jG3@sYD|D=<$@m8De+eOUQFr%zu$Cr%Nte;H=MgJy$@RqRq9r{4;pD7nKp z-pICc<=}8AY>7hq3$L8HX_f9pp*Gz%*e29i#KN97do_bI58MIFdn~x}qQkQHT~gM| z8r{(?tHPmrxr-2PkKsz{_3R>){oW<7zb}i9d*N-#+@V-kPF-;|zZOs?kT)aTFt~&4 z((sjsyoc*#M1aAuiPEtEl7%{~*};6~S3Rw=ej{D?RGb=d7J*M^_<;5=|yK`*`Him{0Qu=gDy98;MzUsBru!jGPdr18f<-I^;-BrI2|nTDnd^E43>2x7mak-!+TM< z?WbHz`r`1Qz{WC5!b`q5Jy-^E_;z{tC9ZN(m%P~#DkldAE+M-($E4q4EYtF_sa;`) z^q7-uG`Wmht^~Kk^F&&?tlA157P)bCNc0Mx-4Eq<4)c9;VN6p)AaiRa-G=(;vKv3+yXGKYppstk4eh#8kN!Y!z-8+{$+2b?WTi zbHhUYSjBs2c)V+HrtPxp$_79+VNaK%v-pD0aKC!I1t(k(r?>{k&0HJ^Il;c$-JwSE z=GHRvSKX%lXYPF;oaN+1bs2W?!do)A#5OplyZovU8h&;7#z7W5f#&16^baC;ADU*d@~ zwrF9OKctXH9tW#MEg#hWaN2pHpIO1dNO{DlGeZCWC(eqU#lcW`dP`B66&GFfj`F|j z7T$4NY@_!(R@P6>QifBWWy;=7DTMFy;v8D}pbB4g+2t2srQ56MEZ=wI;Na=gr*~cZ z$?)y)+Pj69&|kuiz@WIVxxCEY@H#+kta;FHa5#D~hnQip;WiK+5GflE<HG0 zJi&2;5^gNwcE-V-Jz)m%r`R{JozXq0p$2~VtrwNq!s>^glL$G^TSFEe5@JQTNvX`w zH&ScLgmGq^8&}Ip!Qm-i;#?31r{(PEH9yPlPZ^O{nyK@;_}0tlZS`jO&~idY#s#Zz z9~3dmIkPcgA0?~@zo&7tc8+-^3_n0dzx@C181dTga%rF9Ci;(lPVJd{SZ?5}gWC-_ zleh5n!~6b#Gs=2F0Ve_n*NtlLx^hseTz0#@TmnZe+*lEMuuz-XY0&$M!WF57UCnT} z;^6uPD;ipSSvRk}X|PLAx3%(PI^3vTby=AgJiw!!F7^-N8yu;H%dRY=a)B)r%jXuh zn4Du;czXdS{XRa}4%SwDmPa0Srh_*{#eG#~!~*rok#P52nK!rc;2tk*2n&eMt%t1T z?vsT};q_ZSee=|#A)Vm@(n5#J`h`tVxz?N%9PH=a#O?D7hkzGzB5dKQhl_9GER5EP zRe9r|4{BB@x%h^l8`(DvD-zEjg#Xxief?!UY`ZuYx3JF|t|w{BeMvmZ<2@vU3U$e~ zw+(Kty@V(s&))LZKMph0TPhwe*ShxlTg@)N@m78wDLjA}zQy|89Lw}wT!>h>_|NX` zM+bLz>K-BPTo`POdBqUNAYn<|#}r1)IoPlv%L!{C>~;H_b>Bn*Br z460%9TVZfn7+f7TbH(xS?}x+Sm&4%FFnD7aT&&rtf%`kXd7`$I0!{Ogc!r&@R zD<6M5O#fUMyeSO+EDWv)gJ02d9}oY&NB>TSe}63uel-l1h7YG63jbai2FvuNBjMjP z41PTf-WvwL5eC;N_+a?=clGblFnoR_eDKF%@R2akD$Mogli}k((g*uA%=DE}RbIO(gnq9+ zxjPI`>xZCEPKN&rt@g=><2SeJOKTMHMGYU&@ctOQUBP4eNdFg&@c-0a4RRHcJoKg0vFu_Kh}kd{zWS_&Q>qUqRgR}sBB%8EsSV*fr}g>S$KsMz{ib8; ztUlVTdUQJ0i!bVrph30iCE>A^tQfWUOLlxnqYTb0KrYeDxn)OR-;~JNJ`q~%PjT+beftEbpgD%Y>Y;C&%IQ!2lyIPYEYtB)w5 zs=;Sfzn@m?ZjWej#Y4&zjkSJ@l5tkAzMQ>Z%Q&fZn;O<)Z&%}g z^{8s)*Hr7ymX%yHn^x&h>7k_7urltX3S~+i!YOSAPV4{Es)Q$xX}+7(M?9x>Y}r`z z?p4UJQazUQ!jARtiq(ewUUl6VlM!H!FXi z-lU0jsGb@g}97yb-mN7Gta1fHb-J!$BLlroteqd8Ktzz8g?oyuX zqxSHFVdAXn*3)rCkHo$(cIBh$|NMBag^$G4ZBg^=JgO`^sa%{^R?7X_f z?tQ!CdfyjD%?Y*8^U8(JW9skPHuf{h^!D(b;d|XJ%D^e*eS1U+Jg-)NF78mMG|$F< zK!tfS)^;_^uPggb#`=B#qnhFN*oxL{4pV+YE&BN#A@R>X89!2KKOeJYNc$Vz%}RE2 zMAf&mP2K2ORk*fQaeh@PosONRitg<8Ft?UfPPK>M3mt&hLmhqslY8DRwM6`P3O#p1yi?UHFDSswyzIIaY`vP0dWG z)}B?P?>wpv{#Udv-R-I!=d{wV9xdCaD&^P9J&lbqzhcpkt4REWD&5nuf(@xUoL3c{ znTV~*#=de?T~G6H>$eE*MCTr&HVRclepU`rLR9ya$7WmiNT>0_1 zE_)Qz$<=k7e;|Clw@b@y52-_YMv+ddp-q)#rPXT>YlY5k3vrqws$Vk`%D?tdoWroT zL}yjII}_>*XYW>Kw$`ZPpVpSHJ*3XPJ+vpxsb=j!q$$-c zt2y{_g`8G4x2%f%*;rHKu0UL^n4xW}irO6673O&=)|PSg9|3|18)y&5%v29lsN#M_OPZNb>i^Uu{wW-`%DFv<3m|o@E?6}sbdAMx; zT9ubn#_b_hF~3_`QK*k@9*sdGs>L%CO2EuvEx7ZPx}5fin#{~$RsPPS%GP#lGk$mM zs1B*OZI8sA`gK~#SC1+YFU6*yzNC3H?sAQ&?1!}dc`;`H4sFc3W9reG$JFe)_m$fg z?J_+bJBJY^?9~Z1`e(}4tf0nM^s<@!8}y^X(gq# zMTZ=n2b9eAh_+}?#~lGxhS}o^nVHnrn-^P{G z?ICUAPsYk{MD?USRIa*qBBnIE&TeJM>~8grt*zQJoKz*9nT*@I!?7pXqjKtuD{X$S zs&jipMSM;<(^;p_=I&4r_M-Z}xjU4c&IaYapHQ7^Sz` zswp!^HiW6x7M1gCLrtM~hY~t-LU}ebsde=esw3xBR86bK;>R`B8d6X7Y?=G2m%Tkh zVaCl9`i9@Hy3(A~u7KYio4A_$Gcj*BXsue?)gSnST7mf;D%J~kD%IWC$-bn@|EwC* z*R=9}Tve|Uw(3UJQe 
zAExF;;t}E+l}~Hyqv6BWW)(_%NFO#Q)G)e_sfM3dpEISc$KPm%{;q_=8GKZe52?pJr&Ld?rF7#C>N7fqeNMCOG*m8rBDR#Z+CO?m z)$qa|CHCbQ)O}ntpWUR&-dnHDO7mDFq;vK$6?JQi8p-TB9p{|Y#^t1v=MTgNJEG5~ z)m!;PvA%82RI=+e)was2_Q)E|H*WAc>*Ho0OK}ayV-Issm2rNv7S`EzTq`oF?x8yxb8JV6ET0SI@-Bynrg;cG`md) zq-Pb>9@eOHL@A!ztX1xgY2oeIY@3s+buZ_d^UM+L^?qG*ys%3PZI5V8=I&6FoH?Rw z>ufCZ{&8)oV^xU9G3_DsYMt#$@F}%!zrXC6^~ITEsu**7)ho2Dy0g|+WH;2526wF(syR zK&98UvGfkdtR7YyX~%=|nPbXGe^7g;-%wD;9t}(BY*bBajwnZbY^8Hs)n)iYN?U7- z*4*z`5@%zt+5573TYpfwFh8o5{HkWsT@$lwyBdCXtFojyS+0T(FU~0^+ry73p%W_f zPD35+{Fk&|GbhydPALzXlPdb=QLVw;4lSlVteMOlQ&Dv%lo_+Jo%#uNx6`VN-I2B7 zYHNQ-Y$JY8oM+t4X-@7?Hsq>*-MA^auu&OxR@I}mN$b@fiE9~4ziri%`#q`)?RYl( zl-Asj$LyNW8OFa=68*R;#Qe@iSXg({i7@i9SbdJV$8aI&bk#Yl~Z_e&j_I1aUp6=tZ(KpoZ_+x6QenOk$ zrEOfwa;`)Tfxp#nn&}Pk~RB)raEh5Gs%>Y*-e`N?AF)=jwxe%`&6ep z$F;KUVKs?vu1sv&m^t?;bNrsiL*_IOs}gnA>Hz(gtU;KnV>}HyCXEc`CVURkA+tJ zLTq_jW@}A6C{m;g>o;jBo3#3^SRCCATC3hVC8l>mUzv-ot9L*J*IK7sn|n}=zIp6$ zm`nF=<>>qxRkqgcN^pB9=EVtZ@SfEy=N?tJ`GWQky4zyT?^A7gIi8oA8&T5yF&(d; zR`L0R>fSnWA8B^GYIf%ytwDQ8EulTE4OlPNO3v+x3Eryu)E-hZpH{T)cJ(x^(X}!8 zv4BT4li5AW>)Cb6h4!#As&i-T&^D?=m|vrw;4AuKYqM(M{1&C7IiglIdpxf5xYoJ% zSS+daajCIaxv)_y`|=Yi%-#tdGoMq%o4sGlo?WMWZ647^VQ#ZpUhhs--{uizwm+z? zd7^Id_M0yAw;UyD4T=rgfctRQ$ zlM?IqXwL1CvO=mGx-hH?=8viN%&yUk-9r} zg(<&BrO`YVSM|8kG&8v+gv}mPPwDrojkVUP_+HtnruGf3@Z6YcP}9bxjcSEkqssG_ z4?Us<98;b*ZEQMQ)P0^)Q0qQrR_C5$;ftMIGnqTC1a=y+m5=MN@ub$Sw@-DUvsRs3 zEAH$!M`DjLqJ{c>Dn-9vMbsWrQrqR`{IF_`-yc7W=il1H+Fd)R9BLlbH`*hrl%2J) z@*mL&jb{}!f2Z2k?4+i;n__c1rVQ%br9HEm35|NWva7W@7TZYKmes~)Hz}zzC)Bb# zkH*TmRqN1wQmLDLM5i^LQF&Z=QVDNPDo4tYOFgEN@CVg7&fk}Zyqw<=6EdW) z%#SJiTAO0Bp3;YNaTmL3_05+LsYmGSSAx3R)%rX8RbiS3W4|>Nm#|Ywm^-fW^iQo1 z%j<4beLtnNcGkvu#e$#Dlzr!vf|;YbF7TX6$4}^d&uOi3dsx{u8!y8&53AD5#ABe= z`dEv#Qq4&fUu%o1?`+&PojI%}&u!DisB`*8bEi73UOWlykEu$}->;>0j;a{_xVle2 zu6o;PsJb^##01)PA!RQ=5o^-0_OqW+qn(+IjV06ipI5VP+SrbcZVIvd!L?!3j9>Jz z9nG)NLg&X+3^Rvya^j?#`b>GkYFyD;n>CZ}efoaOs)En&(80_zTGhE>)s@+&wCv7$ z9fZd0*3f)Wx+NV?{X75%}`H9#X4=WRAj>Opas;qkJ zR6PFJnout7Vbze%sOs?C79A*^)%-6!67%apbtm(?R9(86meY&-{N3%bs!nS2;dd+j z?cunKb4*F?98@20VXxY9Yr|MrYWIM$bj~XG{!UfNPpCY*k1K0t@7CgGCUmOcto9W< zn>F$WG^_4*mGRtI+!0i|dwahaWw$KAN27&erF


From 21958bb393a654591ed26f339791b752d58f5c8b Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 2 Nov 2023 13:10:33 +0100
Subject: [PATCH 50/63] cmake : disable LLAMA_NATIVE by default (#3906)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 611ed3f4d..3c49d645c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()

 # general
 option(LLAMA_STATIC                     "llama: static link libraries"               OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"           ON)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"           OFF)
 option(LLAMA_LTO                        "llama: enable link time optimization"       OFF)

 # debug

From 4ff1046d75e64f0e556d8dcd930ea25c23eb8b18 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 2 Nov 2023 16:22:30 +0200
Subject: [PATCH 51/63] gguf : print error for GGUFv1 files (#3908)

---
 ggml.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ggml.c b/ggml.c
index d5a49d8e4..605a27940 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18884,6 +18884,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
     ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

+    if (ctx->header.version == 1) {
+        fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
+        fclose(file);
+        gguf_free(ctx);
+        return NULL;
+    }
+
     if (!ok) {
         fprintf(stderr, "%s: failed to read header\n", __func__);
         fclose(file);
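Note on PATCH 51: after this change `gguf_init_from_file` returns `NULL` as soon as it sees a GGUFv1 header instead of trying to parse the old layout. A minimal caller-side sketch of how that can be handled (the file name is hypothetical and error handling is reduced to the `NULL` check):

```c++
#include <cstdio>

#include "ggml.h"

int main(void) {
    struct ggml_context * meta = NULL;

    struct gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ &meta,
    };

    // NULL is returned both for unreadable files and, with this patch, for GGUFv1 files
    struct gguf_context * gguf = gguf_init_from_file("model.gguf", params);
    if (gguf == NULL) {
        fprintf(stderr, "failed to load model.gguf - re-convert the model to a current GGUF version\n");
        return 1;
    }

    printf("GGUF version: %d\n", gguf_get_version(gguf));

    gguf_free(gguf);
    ggml_free(meta);

    return 0;
}
```

The patch itself only fails fast with a clear message; callers that already treat a `NULL` result as a fatal load error need no changes.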
From d6069051de7165a4e06662c89257f5d2905bb156 Mon Sep 17 00:00:00 2001
From: Oleksii Maryshchenko
Date: Thu, 2 Nov 2023 18:10:39 +0100
Subject: [PATCH 52/63] cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903)

* Using cuda memory pools for async alloc/dealloc.

* If cuda device doesnt support memory pool than use old implementation.

* Removed redundant cublasSetStream

---------

Co-authored-by: Oleksii Maryshchenko
---
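Note (not part of the patch): the approach described in the commit message reduces to the standalone sketch below. It assumes a CUDA 11.2+ toolkit and driver (where the stream-ordered allocator exists), uses device 0 and an arbitrary 1 MiB buffer, and falls back to plain `cudaMalloc` when the device reports no default memory pool, mirroring the fallback to the existing buffer pool in the diff that follows:

```c++
// illustrative sketch only - build with: nvcc -o pool_demo pool_demo.cu
#include <cstdint>
#include <cstdio>

#include <cuda_runtime.h>

int main(void) {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // query the default memory pool of device 0; this fails on devices/drivers without pool support
    cudaMemPool_t pool = nullptr;
    const bool has_pool = cudaDeviceGetMemPool(&pool, 0) == cudaSuccess;

    if (has_pool) {
        // keep freed blocks cached in the pool instead of returning them to the OS
        uint64_t threshold = UINT64_MAX;
        cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
    }

    const size_t size = 1u << 20; // 1 MiB, arbitrary
    void * ptr = nullptr;

    if (has_pool) {
        cudaMallocFromPoolAsync(&ptr, size, pool, stream); // stream-ordered allocation from the pool
    } else {
        cudaMalloc(&ptr, size);                            // fallback, analogous to ggml_cuda_pool_malloc
    }

    // ... launch kernels that use ptr on `stream` here ...

    if (has_pool) {
        cudaFreeAsync(ptr, stream); // returns the block to the pool, ordered on the stream
    } else {
        cudaFree(ptr);
    }

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);

    printf("memory pool path used: %s\n", has_pool ? "yes" : "no");
    return 0;
}
```

Raising `cudaMemPoolAttrReleaseThreshold` to `UINT64_MAX` is what makes the repeated per-call allocations cheap: freed blocks stay cached in the pool and are handed back by `cudaMallocFromPoolAsync` without going back to the driver allocator.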
(row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); - src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream); } if (src1_on_device && src1_is_contiguous) { src1_ddf[id] = (float *) src1_extra->data_device[id]; } else { - src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream); } if (convert_src1_to_q8_1) { - src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; + src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream); if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); - CUDA_CHECK(cudaGetLastError()); + // CUDA_CHECK(cudaGetLastError()); } } @@ -6951,7 +6977,7 @@ static void ggml_cuda_op_mul_mat( dst_dd[id] = (float *) dst_extra->data_device[id]; } else { const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); - dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); + dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream); } } @@ -7077,24 +7103,6 @@ static void ggml_cuda_op_mul_mat( } } - for (int64_t id = 0; id < g_device_count; ++id) { - CUDA_CHECK(ggml_cuda_set_device(id)); - - // free buffers again when done - if (src0_as[id] > 0) { - ggml_cuda_pool_free(src0_dd[id], src0_as[id]); - } - if (src1_asf[id] > 0) { - ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); - } - if (src1_asq[id] > 0) { - ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); - } - if (dst_as[id] > 0) { - ggml_cuda_pool_free(dst_dd[id], dst_as[id]); - } - } - // main device waits for all other devices to be finished if (split && g_device_count > 1) { int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; @@ -7112,6 +7120,21 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaDeviceSynchronize()); } + + for (int64_t id = 0; id < g_device_count; ++id) { + if (src0_as[id] > 0) { + ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]); + } + if (src1_asq[id] > 0) { + ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]); + } + if (dst_as[id] > 0) { + ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]); + } + } } static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -7298,11 +7321,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const GGML_ASSERT(to_fp16_cuda != nullptr); size_t src1_as = 0; - half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream); to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream); GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % 
ne03 == 0); @@ -7349,10 +7372,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } else { // use cublasGemmBatchedEx const int ne23 = ne12*ne13; - - void ** ptrs_as = nullptr; + // allocate device memory for pointers size_t ptrs_s = 0; - ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s); + void ** ptrs_as = (void **)ggml_cuda_pool_malloc_async(3*ne23*sizeof(void *), &ptrs_s, id, main_stream); dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( @@ -7365,7 +7387,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const dst->nb[2], dst->nb[3], r2, r3); CUDA_CHECK(cudaGetLastError()); - CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, @@ -7375,16 +7396,21 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ne23, CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - ggml_cuda_pool_free(ptrs_as, ptrs_s); + // free device memory for pointers + if (ptrs_s != 0) { + ggml_cuda_pool_free_async(ptrs_as, ptrs_s, id, main_stream); + } } #endif const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); - - ggml_cuda_pool_free(src1_as_f16, src1_as); - ggml_cuda_pool_free(dst_f16, dst_as); + if (src1_as != 0) { + ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream); + } + if (dst_as != 0) { + ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream); + } } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { From c7743fe1c1cbda5a886362aa371480360580fdf0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 20:32:11 +0200 Subject: [PATCH 53/63] cuda : fix const ptrs warning causing ROCm build issues (#3913) --- ggml-cuda.cu | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 58b58f331..06c28f565 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7248,7 +7248,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor __global__ void k_compute_batched_ptrs( const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, - void ** ptrs, + const void ** ptrs_src, void ** ptrs_dst, int ne12, int ne13, int ne23, int nb02, int nb03, @@ -7265,9 +7265,9 @@ __global__ void k_compute_batched_ptrs( int i03 = i13 / r3; int i02 = i12 / r2; - ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03; - ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2; + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; } static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -7372,14 +7372,20 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } else { // use cublasGemmBatchedEx const int ne23 = ne12*ne13; - // allocate device memory for pointers - size_t ptrs_s = 0; - void ** ptrs_as = (void **)ggml_cuda_pool_malloc_async(3*ne23*sizeof(void *), &ptrs_s, id, main_stream); + + const void ** ptrs_src = nullptr; + void ** ptrs_dst = nullptr; + + size_t 
ptrs_src_s = 0; + size_t ptrs_dst_s = 0; + + ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream); + ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream); dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( src0_as_f16, src1_as_f16, dst_f16, - ptrs_as, + ptrs_src, ptrs_dst, ne12, ne13, ne23, nb02, nb03, @@ -7390,15 +7396,18 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01, + &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, ne23, CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // free device memory for pointers - if (ptrs_s != 0) { - ggml_cuda_pool_free_async(ptrs_as, ptrs_s, id, main_stream); + + if (ptrs_src_s != 0) { + ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream); + } + if (ptrs_dst_s != 0) { + ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream); } } #endif From 224e7d5b14cbabab7ae45c64db2cfde979c8455d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 20:44:12 +0200 Subject: [PATCH 54/63] readme : add notice about #3912 --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index b56ecaec7..9c9e36ad0 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436 -- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252) +- ⚠️ **Upcoming change that might break functionality. 
Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912 ---- From 51b2fc11f7f605fff49725a4540e9a6ef7b51b70 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 2 Nov 2023 15:40:31 -0400 Subject: [PATCH 55/63] cmake : fix relative path to git submodule index (#3915) --- common/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 0150114e3..ac594b2ca 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -11,7 +11,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") if(NOT IS_DIRECTORY "${GIT_DIR}") file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}") + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}") endif() set(GIT_INDEX "${GIT_DIR}/index") From 629f917cd6b96ba1274c49a8aab163b1b189229d Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Thu, 2 Nov 2023 13:58:22 -0600 Subject: [PATCH 56/63] cuda : add ROCM aliases for CUDA pool stuff (#3918) --- ggml-cuda.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 06c28f565..baf02df2b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -39,6 +39,10 @@ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceGetMemPool hipDeviceGetMemPool +#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold +#define cudaMemPoolSetAttribute hipMemPoolSetAttribute +#define cudaMemPool_t hipMemPool_t #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t @@ -48,6 +52,7 @@ #define cudaEvent_t hipEvent_t #define cudaEventDestroy hipEventDestroy #define cudaFree hipFree +#define cudaFreeAsync hipFreeAsync #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount @@ -55,6 +60,7 @@ #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocFromPoolAsync hipMallocFromPoolAsync #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync From 3fdbe6b66b7b5c6ad3b2f245cbad1517c27ff776 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Fri, 3 Nov 2023 02:31:58 -0400 Subject: [PATCH 57/63] llama : change yarn_ext_factor placeholder to -1 (#3922) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index bb60044b4..cc0211ceb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7982,7 +7982,7 @@ struct llama_context_params llama_context_default_params() { /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, - /*.yarn_ext_factor =*/ NAN, + /*.yarn_ext_factor =*/ -1.0f, /*.yarn_attn_factor =*/ 1.0f, /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, @@ -8125,7 +8125,7 @@ struct llama_context * llama_new_context_with_model( cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } - if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set' + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; } From 05816027d649f977468fc804cdb54e99eac246d1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 3 Nov 2023 09:24:00 +0200 Subject: [PATCH 58/63] common : YAYF (yet another YARN fix) (#3925) ggml-ci --- common/common.h | 44 ++++++++++++++++++++++---------------------- llama.h | 10 +++++----- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/common/common.h b/common/common.h index 72a49b890..9ad625633 100644 --- a/common/common.h +++ b/common/common.h @@ -43,29 +43,29 @@ extern char const *LLAMA_BUILD_TARGET; int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed + uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 16; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - float yarn_ext_factor = NAN; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f;// YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 16; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
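The placeholder change in patch 57 above (NAN replaced by -1.0f, with the check rewritten from std::isnan to a plain comparison) follows a common sentinel pattern: NaN-based flags can misbehave when fast-math optimizations assume NaNs never occur, whereas a negative value is always testable. A small, hypothetical illustration of the same resolution logic (the struct and function names here are made up for the example):

```c++
#include <cstdio>

struct rope_params {
    float yarn_ext_factor = -1.0f;  // negative means "not set"
};

// Simplified mirror of the resolution step in llama_new_context_with_model: a
// negative placeholder is replaced by a default that depends on the scaling type.
static float resolve_ext_factor(const rope_params & p, bool scaling_is_yarn) {
    if (p.yarn_ext_factor < 0.0f) {
        return scaling_is_yarn ? 1.0f : 0.0f;
    }
    return p.yarn_ext_factor;  // an explicit user value wins
}

int main() {
    rope_params p;
    printf("default (yarn):  %.2f\n", resolve_ext_factor(p, true));   // 1.00
    printf("default (other): %.2f\n", resolve_ext_factor(p, false));  // 0.00
    p.yarn_ext_factor = 0.25f;
    printf("explicit:        %.2f\n", resolve_ext_factor(p, true));   // 0.25
    return 0;
}
```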
+ float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters diff --git a/llama.h b/llama.h index 3f1becd76..e8dc04bb5 100644 --- a/llama.h +++ b/llama.h @@ -175,11 +175,11 @@ extern "C" { }; struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - uint32_t n_ctx; // text context, 0 = from model - uint32_t n_batch; // prompt processing maximum batch size - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing + uint32_t seed; // RNG seed, -1 for random + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 From 8f961abdc4e134c83bf8c2ad618ab256b4cae0f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 3 Nov 2023 09:41:17 +0200 Subject: [PATCH 59/63] speculative : change default p_accept to 0.5 + CLI args (#3919) ggml-ci --- common/common.cpp | 14 ++++++++++++++ common/common.h | 8 ++++++-- examples/speculative/speculative.cpp | 8 +++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index e938dee16..20cc4a081 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -403,6 +403,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_sequences = std::stoi(argv[i]); + } else if (arg == "--p-accept" || arg == "-pa") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_accept = std::stof(argv[i]); + } else if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.p_split = std::stof(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -778,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); + printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. 
use with multimodal models\n"); diff --git a/common/common.h b/common/common.h index 9ad625633..dd6b002eb 100644 --- a/common/common.h +++ b/common/common.h @@ -44,6 +44,7 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_predict = -1; // new tokens to predict @@ -54,6 +55,8 @@ struct gpt_params { int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode + float p_accept = 0.5f; // speculative decoding accept probability + float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors @@ -66,7 +69,8 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment + // pinging @cebtenzzre // // sampling parameters struct llama_sampling_params sparams; @@ -90,7 +94,7 @@ struct gpt_params { int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 798684f66..3a8e27811 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -37,9 +37,11 @@ int main(int argc, char ** argv) { // max number of parallel drafting sequences (i.e. 
tree branches) const int n_seq_dft = params.n_parallel; - // TODO: make this configurable - const float p_accept = 0.80f; - const float p_split = 0.10f; + // probability threshold for accepting a token from the draft model + const float p_accept = params.p_accept; + + // probability threshold for splitting a draft branch (only for n_seq_dft > 1) + const float p_split = params.p_split; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); From abb77e7319aabc0b5cfb7c22da690a692489b6b7 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 3 Nov 2023 12:13:09 +0100 Subject: [PATCH 60/63] ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921) --- ggml-cuda.cu | 53 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index baf02df2b..bdbcca0ca 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -989,7 +989,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; @@ -1093,7 +1093,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; @@ -1197,7 +1197,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; @@ -1451,7 +1451,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; @@ -4261,7 +4261,7 @@ template static __global__ void template static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row >= nrows) { return; @@ -4301,7 +4301,7 @@ template static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block - const int row = blockIdx.y*blockDim.y + threadIdx.y; + const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row >= nrows) { return; @@ -4874,7 +4874,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu static void 
dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); @@ -4883,7 +4884,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); @@ -4892,7 +4893,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); @@ -4901,7 +4902,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); @@ -4910,7 +4911,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); @@ -4920,7 +4921,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); } @@ -4929,7 +4930,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); + const 
dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); } @@ -4938,7 +4939,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); } @@ -4953,7 +4954,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); } @@ -4961,7 +4962,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -4970,7 +4971,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -4979,7 +4980,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -4988,7 +4989,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -4997,7 +4998,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); 
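The comment added earlier in this patch states the motivation: gridDim.y and gridDim.z are capped at 65535 blocks, while gridDim.x can address up to 2^31 - 1 blocks on all recent GPUs, so mapping rows to the x dimension removes a hard limit on very tall matrices. A standalone sketch of the new launch geometry (illustrative only, not llama.cpp code):

```c++
// Illustrative CUDA program showing why rows are mapped to the x grid dimension.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale_rows(const float * x, float * y, const int ncols, const int nrows, const float s) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;  // same row indexing as the kernels above
    if (row >= nrows) {
        return;
    }
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        y[row*ncols + col] = s*x[row*ncols + col];
    }
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // typically prints x=2147483647 y=65535 z=65535
    printf("max grid size: x=%d y=%d z=%d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);

    const int nrows = 200000;  // more rows than a y- or z-dimension grid could address
    const int ncols = 128;

    float * x; float * y;
    cudaMalloc(&x, nrows*ncols*sizeof(float));
    cudaMalloc(&y, nrows*ncols*sizeof(float));
    cudaMemset(x, 0, nrows*ncols*sizeof(float));

    const int rows_per_block = 2;
    const dim3 block_dims(32, rows_per_block, 1);
    const dim3 block_nums((nrows + rows_per_block - 1)/rows_per_block, 1, 1);  // 100000 blocks is fine in x

    scale_rows<<<block_nums, block_dims>>>(x, y, ncols, nrows, 0.5f);
    cudaDeviceSynchronize();

    cudaFree(x);
    cudaFree(y);
    return 0;
}
```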
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5006,7 +5007,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5015,7 +5016,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5024,7 +5025,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5033,7 +5034,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5042,7 +5043,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q <<>>(vx, vy, dst, ncols, nrows); @@ -5061,7 +5062,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec<1, 1, convert_f16> <<>>(vx, y, dst, ncols, nrows); From 5ba37461711095c0284233dbd14f0d9010cdbf56 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Fri, 3 Nov 2023 13:00:31 -0500 Subject: [PATCH 61/63] ggml-metal: fix yarn 
rope (#3937) --- ggml-metal.m | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index b33a3cb8f..acdb83843 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; + // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); From d9b33fe95bd257b36c84ee5769cc048230067d6f Mon Sep 17 00:00:00 2001 From: Peter Sugihara Date: Fri, 3 Nov 2023 12:18:18 -0700 Subject: [PATCH 62/63] metal : round up to 16 to fix MTLDebugComputeCommandEncoder assertion (#3938) --- ggml-metal.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index acdb83843..78ae4485d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0]; const int64_t nrows = ggml_nrows(src0); From f28af0d81aa1010afa5de74cf627dcb04bea3157 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 4 Nov 2023 16:20:34 -0600 Subject: [PATCH 63/63] gguf-py: Support 01.AI Yi models (#3943) --- gguf-py/gguf/gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 727b4e554..a2271d225 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -393,6 +393,7 @@ class TensorNameMap: "layers.{bid}.attention_norm", # llama-pth "encoder.layer.{bid}.attention.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi ), # Attention norm 2 @@ -464,6 +465,7 @@ class TensorNameMap: "layers.{bid}.ffn_norm", # llama-pth "encoder.layer.{bid}.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon + "model.layers.{bid}.ln2", # yi ), # Feed-forward up