From f266259ad9a2bce5a34d919592310147af23f3dc Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Mon, 17 Apr 2023 15:10:57 +0200 Subject: [PATCH 1/8] Speedup the AVX-512 implementation of ggml_vec_dot_q4_0() (#933) --- CMakeLists.txt | 22 ++++- ggml.c | 229 ++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 2 + llama.cpp | 26 +++--- 4 files changed, 235 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a20de3a2..9189b6fc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) +option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) +option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) option(LLAMA_FMA "llama: enable FMA" ON) # in MSVC F16C is implied with AVX2/AVX512 if (NOT MSVC) @@ -220,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") if (MSVC) if (LLAMA_AVX512) add_compile_options(/arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (LLAMA_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() elseif (LLAMA_AVX2) add_compile_options(/arch:AVX2) elseif (LLAMA_AVX) @@ -240,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") endif() if (LLAMA_AVX512) add_compile_options(-mavx512f) - # add_compile_options(-mavx512cd) - # add_compile_options(-mavx512dq) - # add_compile_options(-mavx512bw) + add_compile_options(-mavx512bw) + endif() + if (LLAMA_AVX512_VBMI) + add_compile_options(-mavx512vbmi) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_options(-mavx512vnni) endif() endif() else() diff --git a/ggml.c b/ggml.c index 69974989c..2be2ce3e9 100644 --- a/ggml.c +++ b/ggml.c @@ -1977,33 +1977,187 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float } #if __AVX512F__ && QK4_0 == 32 -static inline __m512 dot_q4_0_oneblock_avx512( +static inline __m512i bytes_from_q4_0_twoblocks_avx512( const __m512i blocks ) { + // The 64 bytes of `blocks` contain two consecutive Q4_0 blocks loaded from memory: + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // | :. =_ () [] <> () Zz Yy| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |Xx Ww Vv Uu Tt Ss Rr Qq Pp Oo Nn Mm Ll Kk Jj Ii Hh Gg Ff Ee Dd Cc Bb Aa | + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // + // Bytes 04..19 (block #0) and 24..39 (block #1) both contain 32 nibbles (4-bit unsigned integers). + // We have exactly 64 nibbles, so we want to place each nibble into a separate byte. + // Bytes 00..03 and 20..23 contain scales, which are irrelevant to this function. + // Bytes 40..63 are masked when loading the data, so they are zeroed out. +#ifdef __AVX512VBMI__ + const __m512i byte_perm = _mm512_set_epi8( + 39, 38, 39, 38, 37, 36, 37, 36, 35, 34, 35, 34, 33, 32, 33, 32, + 31, 30, 31, 30, 29, 28, 29, 28, 27, 26, 27, 26, 25, 24, 25, 24, + 19, 18, 19, 18, 17, 16, 17, 16, 15, 14, 15, 14, 13, 12, 13, 12, + 11, 10, 11, 10, 9, 8, 9, 8, 7, 6, 7, 6, 5, 4, 5, 4 + ); + const __m512i permuted = _mm512_permutexvar_epi8( byte_perm, blocks ); + // After applying VPERMB, `permuted` looks like this: + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |:. =_ :. =_ () [] () [] <> () <> () Zz Yy Zz Yy Xx Ww Xx Ww Vv Uu Vv Uu Tt Ss Tt Ss Rr Qq Rr Qq| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |Pp Oo Pp Oo Nn Mm Nn Mm Ll Kk Ll Kk Jj Ii Jj Ii Hh Gg Hh Gg Ff Ee Ff Ee Dd Cc Dd Cc Bb Aa Bb Aa| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ +#else + const __m512i word_perm = _mm512_set_epi16( + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2 + ); + const __m512i permuted = _mm512_permutexvar_epi16( word_perm, blocks ); + // This is the fallback path for CPUs that don't support VPERMB. Since we permute 16-bit groups only, + // VPERMB can be replaced with VPERMW. We could always use VPERMW, but at least on Tiger Lake and + // Ice Lake VPERMW followed by a right shift is quite noticeably slower than VPERMB. +#endif + + // Shift every odd-numbered 16-bit group to the right by 4 bits. + const __mmask32 shift_mask = 0xaaaaaaaa; + const __m512i shifted = _mm512_mask_srai_epi16( permuted, shift_mask, permuted, 4 ); + // After applying VPSRAW, `shifted` looks like this (the "empty" nibbles are filled with zeroes): + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | : .= :. =_ ( )[ () [] < >( <> () Z zY Zz Yy X xW Xx Ww V vU Vv Uu T tS Tt Ss R rQ Rr Qq + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | P pO Pp Oo N nM Nn Mm L lK Ll Kk J jI Jj Ii H hG Hh Gg F fE Ff Ee D dC Dd Cc B bA Bb Aa| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + + // Now we just need to zero out the higher nibble in each byte, and we're done. + const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf ); + return _mm512_and_si512( low_nibble_mask, shifted ); + // The final result looks like this: + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | : = . _ ( [ ) ] < ( > ) Z Y z y X W x w V U v u T S t s R Q r q| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | P O p o N M n m L K l k J I j i H G h g F E f e D C d c B A b a| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ +} + +static inline __m512 dot_q4_0_twoblocks_avx512( __m512 acc, const block_q4_0 * restrict x, const block_q4_0 * restrict y, int i ) { - // Compute combined scale for the block - __m512 d = _mm512_set1_ps( x[i].d * y[i].d ); + // A pair of Q4_0 blocks spans 40 bytes, while an AVX-512 register has 64. The remaining 24 bytes + // can potentially be unaddressable, so we make sure to mask them out before the load, even though + // we don't use them at all. This might hurt the performance slightly, since the compiler is forced + // to use e.g. `VMOVDQU64 REG, MASK, [ADDR] + VPERMB ..., REG` instead of just `VPERMB ..., [ADDR]`. + const __mmask8 load_mask = 0x1f; + const __m512i blocks_0 = _mm512_maskz_loadu_epi64( load_mask, &x[i] ); + const __m512i blocks_1 = _mm512_maskz_loadu_epi64( load_mask, &y[i] ); - __m256i bx = bytesFromNibbles( x[i].qs ); - __m256i by = bytesFromNibbles( y[i].qs ); + // We want to multiply the scales, so we interpret both registers as 16 32-bit floats: + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // blocks_0_float + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | | | | | | | xx | xx | xx | xx | B | xx | xx | xx | xx | A | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // blocks_1_float + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | | | | | | | xx | xx | xx | xx | D | xx | xx | xx | xx | C | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + const __m512 blocks_0_float = _mm512_castsi512_ps( blocks_0 ); + const __m512 blocks_1_float = _mm512_castsi512_ps( blocks_1 ); + // We absolutely shouldn't touch the floats marked with `xx`: they contain some + // random data, which might very well underflow. At least on Intel, this leads + // to a huge penalty that can't be ignored (easily 100x or more) unless you + // compile your code with something like `-ffast-math` to enable FTZ/DAZ flags. + // (and ggml can't assume that you do)... + const __mmask16 scale_mul_mask = 0x21; +#ifdef __clang__ + // ...however, clang decides to optimize the multiplication mask away: + // https://godbolt.org/z/P8PqdsfvW + // gcc and MSVC do the sane thing. This horrible workaround forces clang to emit the mask. + __m512i scales; + __asm__( + "vmulps %1, %2, %0%{%3%}" + : "=v" ( scales ) + : "vm" ( blocks_0_float ), "v" ( blocks_1_float ), "Yk" ( scale_mul_mask ) + ); +#else + const __m512 scales = _mm512_maskz_mul_ps( scale_mul_mask, blocks_0_float, blocks_1_float ); +#endif + const __m512i scale_perm = _mm512_set_epi32( + 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 0, 0, 0, 0 + ); + const __m512 permuted_scales = _mm512_permutexvar_ps( scale_perm, scales ); + // After VMULPS and VPERMPS, `permuted_scales` looks like this: + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | B*D| B*D| B*D| B*D| B*D| B*D| B*D| B*D| A*C| A*C| A*C| A*C| A*C| A*C| A*C| A*C| + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - by = _mm256_sub_epi8( by, off ); + const __m512i bytes_0 = bytes_from_q4_0_twoblocks_avx512( blocks_0 ); + const __m512i bytes_1 = bytes_from_q4_0_twoblocks_avx512( blocks_1 ); - // Sign-extend 16 signed bytes into int16_t - __m512i x32 = _mm512_cvtepi8_epi16( bx ); - __m512i y32 = _mm512_cvtepi8_epi16( by ); - // Compute products of int16_t integers, add pairwise - __m512i i64 = _mm512_madd_epi16( x32, y32 ); + // Now we want to compute dot products of 4-element byte vectors and store them in + // 32-bit integers. That is (only one 4-element vector is shown for clarity): + // +----+----+----+----+ + // ... | 03 | 02 | 01 | 00 | + // +----+----+----+----+ + // bytes_0 + // +----+----+----+----+ + // ... | D | C | B | A | + // +----+----+----+----+ + // bytes_1 + // +----+----+----+----+ + // ... | H | G | F | E | + // +----+----+----+----+ + // final_res_int + // +----+----+----+----+ + // ... | A*E+B*F+C*G+D*H | + // +----+----+----+----+ + const __m512i plus_8 = _mm512_set1_epi8( 8 ); + const __m512i bytes_1_minus_8 = _mm512_sub_epi8( bytes_1, plus_8 ); - // Convert int32_t to float - __m512 p = _mm512_cvtepi32_ps( i64 ); - // Apply the scale, and accumulate - return _mm512_fmadd_ps( d, p, acc ); +#ifdef __AVX512VNNI__ + // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch: + // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8 + // from each nibble, so they can be negative. So, instead of `(bytes_0 - 8) * (bytes_1 - 8)`, + // we compute `bytes_0 * (bytes_1 - 8) + bytes_1 * (-8) + 64`. VPDPBUSDS uses an accumulator, + // which means we only need 2 instructions. + const __m512i dot_init = _mm512_set1_epi32( 4 * 64 ); + const __m512i minus_8 = _mm512_set1_epi8( -8 ); + const __m512i prod_0 = _mm512_dpbusds_epi32( dot_init, bytes_1, minus_8 ); + const __m512i final_res_int = _mm512_dpbusds_epi32( prod_0, bytes_0, bytes_1_minus_8 ); +#else + // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones. + // It has the same catch as VPDPBUSDS: the left operand should be unsigned. + // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me + // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119 + const __m512i one = _mm512_set1_epi16( 1 ); + const __m512i prod_0 = _mm512_maddubs_epi16( bytes_0, bytes_1_minus_8 ); + const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, bytes_1_minus_8 ); + const __m512i diff = _mm512_sub_epi16( prod_0, prod_1 ); + const __m512i final_res_int = _mm512_madd_epi16( diff, one ); +#endif + + // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate. + const __m512 final_res_float = _mm512_cvtepi32_ps( final_res_int ); + return _mm512_fmadd_ps( permuted_scales, final_res_float, acc ); } #endif @@ -2135,25 +2289,26 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest __m512 acc0 = _mm512_setzero_ps(); __m512 acc1 = _mm512_setzero_ps(); - const int superblock_size = 8; + const int superblock_size = 16; + const int superblock_count = nb / superblock_size; for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { int i = superblock_ix * superblock_size; - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 ); } // Remainders - for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i ); + for (int i = superblock_count * superblock_size; i < nb; i += 2) { + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i ); } // Horizontal sum of all lanes of the accumulator @@ -11303,6 +11458,22 @@ int ggml_cpu_has_avx512(void) { #endif } +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; diff --git a/ggml.h b/ggml.h index 241e96a19..e693754c9 100644 --- a/ggml.h +++ b/ggml.h @@ -808,6 +808,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * int ggml_cpu_has_avx(void); int ggml_cpu_has_avx2(void); int ggml_cpu_has_avx512(void); +int ggml_cpu_has_avx512_vbmi(void); +int ggml_cpu_has_avx512_vnni(void); int ggml_cpu_has_fma(void); int ggml_cpu_has_neon(void); int ggml_cpu_has_arm_fma(void); diff --git a/llama.cpp b/llama.cpp index a6429a4e7..3b916e5ad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1915,18 +1915,20 @@ const char * llama_print_system_info(void) { static std::string s; s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); } From 69b740289f9b3756ea9dd2a23f241c6f688d88b9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Apr 2023 16:16:23 +0300 Subject: [PATCH 2/8] ggml : avoid using ggml_fp16_to_fp32() and ggml_fp32_to_fp16() in ggml.c --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 2be2ce3e9..995a2faab 100644 --- a/ggml.c +++ b/ggml.c @@ -8057,11 +8057,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = ggml_fp16_to_fp32(src[0]); - const float x1 = ggml_fp16_to_fp32(src[1]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); - dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta); - dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } From eb17a026fd23d1c1b612fa4600f7f5c58e501a28 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Apr 2023 17:31:06 +0300 Subject: [PATCH 3/8] quantize-stats : fix bug in --type argument --- examples/quantize-stats/quantize-stats.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 050300931..cd973e8ac 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -221,7 +221,7 @@ int main(int argc, char ** argv) { break; } int j; - for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) { + for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) { // find match } if (j < GGML_TYPE_COUNT) { From efd05648c88a0923a55f56e7ce1b0f9c33410afb Mon Sep 17 00:00:00 2001 From: Arik Poznanski Date: Mon, 17 Apr 2023 17:41:53 +0300 Subject: [PATCH 4/8] llama : well-defined static initialization of complex objects (#927) * Replaced static initialization of complex objects with a initialization on first use. This prevents an undefined behavior on program run, for example, crash in Release build, works in Debug build * replaced use of auto with exact type to avoid using -std=c++14 * Made the assessors functions for static maps be static const --- llama.cpp | 72 +++++++++++++++++++++++--------------- tests/test-tokenizer-0.cpp | 20 ++++++----- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3b916e5ad..cdd8bd16b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -42,35 +42,51 @@ static const size_t MB = 1024*1024; // TODO: dynamically determine these sizes // needs modifications in ggml -static const std::map MEM_REQ_SCRATCH0 = { - { MODEL_7B, 512ull*MB }, - { MODEL_13B, 512ull*MB }, - { MODEL_30B, 512ull*MB }, - { MODEL_65B, 512ull*MB }, -}; +static const std::map & MEM_REQ_SCRATCH0() +{ + static std::map _MEM_REQ_SCRATCH0 = { + { MODEL_7B, 512ull * MB }, + { MODEL_13B, 512ull * MB }, + { MODEL_30B, 512ull * MB }, + { MODEL_65B, 512ull * MB }, + }; + return _MEM_REQ_SCRATCH0; +} -static const std::map MEM_REQ_SCRATCH1 = { - { MODEL_7B, 512ull*MB }, - { MODEL_13B, 512ull*MB }, - { MODEL_30B, 512ull*MB }, - { MODEL_65B, 512ull*MB }, +static const std::map & MEM_REQ_SCRATCH1() +{ + static std::map _MEM_REQ_SCRATCH1 = { + { MODEL_7B, 512ull * MB }, + { MODEL_13B, 512ull * MB }, + { MODEL_30B, 512ull * MB }, + { MODEL_65B, 512ull * MB }, + }; + return _MEM_REQ_SCRATCH1; }; // 2*n_embd*n_ctx*n_layer*sizeof(float16) -static const std::map MEM_REQ_KV_SELF = { - { MODEL_7B, 1026ull*MB }, - { MODEL_13B, 1608ull*MB }, - { MODEL_30B, 3124ull*MB }, - { MODEL_65B, 5120ull*MB }, +static const std::map & MEM_REQ_KV_SELF() +{ + static std::map _MEM_REQ_KV_SELF = { + { MODEL_7B, 1026ull * MB }, + { MODEL_13B, 1608ull * MB }, + { MODEL_30B, 3124ull * MB }, + { MODEL_65B, 5120ull * MB }, + }; + return _MEM_REQ_KV_SELF; }; // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map MEM_REQ_EVAL = { - { MODEL_7B, 768ull*MB }, - { MODEL_13B, 1024ull*MB }, - { MODEL_30B, 1280ull*MB }, - { MODEL_65B, 1536ull*MB }, +static const std::map & MEM_REQ_EVAL() +{ + static std::map _MEM_REQ_EVAL = { + { MODEL_7B, 768ull * MB }, + { MODEL_13B, 1024ull * MB }, + { MODEL_30B, 1280ull * MB }, + { MODEL_65B, 1536ull * MB }, + }; + return _MEM_REQ_EVAL; }; // default hparams (LLaMA 7B) @@ -899,13 +915,13 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size + - MEM_REQ_SCRATCH0.at(model.type) + - MEM_REQ_SCRATCH1.at(model.type) + - MEM_REQ_EVAL.at (model.type); + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = - scale*MEM_REQ_KV_SELF.at(model.type); + scale*MEM_REQ_KV_SELF().at(model.type); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1732,10 +1748,10 @@ struct llama_context * llama_init_from_file( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } return ctx; diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 55b086dae..b08984571 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -5,13 +5,17 @@ #include #include -static const std::map> k_tests = { - { "Hello World", { 1, 10994, 2787, }, }, - { " Hello World", { 1, 15043, 2787, }, }, - { " Hello World!", { 1, 15043, 2787, 29991, }, }, - { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, }, +static const std::map> & k_tests() +{ + static std::map> _k_tests = { + { "Hello World", { 1, 10994, 2787, }, }, + { " Hello World", { 1, 15043, 2787, }, }, + { " Hello World!", { 1, 15043, 2787, 29991, }, }, + { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, }, + }; + return _k_tests; }; int main(int argc, char **argv) { @@ -47,7 +51,7 @@ int main(int argc, char **argv) { return 2; } - for (const auto & test_kv : k_tests) { + for (const auto & test_kv : k_tests()) { std::vector res(test_kv.first.size()); const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true); res.resize(n); From 315a95a4d30db726fb7d244dd3b9e90a83fb1616 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Mon, 17 Apr 2023 17:28:55 +0200 Subject: [PATCH 5/8] Add LoRA support (#820) --- convert-lora-to-ggml.py | 124 +++++++++++ examples/common.cpp | 15 ++ examples/common.h | 7 +- examples/main/main.cpp | 11 + examples/perplexity/perplexity.cpp | 11 + ggml.c | 327 ++++++++++++++++++++++++++--- ggml.h | 6 + llama.cpp | 251 ++++++++++++++++++++++ llama.h | 12 ++ llama_util.h | 30 +-- 10 files changed, 753 insertions(+), 41 deletions(-) create mode 100644 convert-lora-to-ggml.py diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py new file mode 100644 index 000000000..8a2085c25 --- /dev/null +++ b/convert-lora-to-ggml.py @@ -0,0 +1,124 @@ +import json +import os +import re +import struct +import sys +from typing import Any, Dict, Sequence, TextIO + +import torch + +from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType + +HF_SUBLAYER_TO_GGML = { + "self_attn.q_proj": "attention.wq", + "self_attn.k_proj": "attention.wk", + "self_attn.v_proj": "attention.wv", + "self_attn.o_proj": "attention.wo", + "mlp.gate_proj": "feed_forward.w1", + "mlp.down_proj": "feed_forward.w2", + "mlp.up_proj": "feed_forward.w3", + "input_layernorm": "attention_norm", + "post_attention_layernorm": "ffn_norm", + # "norm": "norm", + # "embed_tokens": "tok_embeddings", + # "lm_head": "output", +} + + +def translate_tensor_name(t: str) -> str: + match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) + if match: + nn = match.group(1) + sub_layer = match.group(2) + lora_type = match.group(3) + + sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) + if sub_layer_renamed is None: + print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") + sys.exit(1) + + output_string = ( + f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" + ) + return output_string + else: + print(f"Error: unrecognized tensor {t}") + sys.exit(1) + + +def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: + fout.write(b"ggla"[::-1]) # magic (ggml lora) + fout.write(struct.pack("i", 1)) # file version + fout.write(struct.pack("ii", params["r"], params["lora_alpha"])) + + +def write_tensor_header( + self, name: str, shape: Sequence[int], data_type: DataType +) -> None: + sname = name.encode("utf-8") + fout.write( + struct.pack( + "iii", + len(shape), + len(sname), + DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], + ) + ) + fout.write(struct.pack("i" * len(shape), *shape[::-1])) + fout.write(sname) + fout.seek((fout.tell() + 31) & -32) + + +if len(sys.argv) != 2: + print(f"Usage: python {sys.argv[0]} ") + print( + "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" + ) + sys.exit(1) + +input_json = os.path.join(sys.argv[1], "adapter_config.json") +input_model = os.path.join(sys.argv[1], "adapter_model.bin") +output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") + +model = torch.load(input_model, map_location="cpu") + +with open(input_json, "r") as f: + params = json.load(f) + +if params["peft_type"] != "LORA": + print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + sys.exit(1) + +if params["fan_in_fan_out"] == True: + print("Error: param fan_in_fan_out is not supported") + sys.exit(1) + +if params["bias"] is not None and params["bias"] != "none": + print("Error: param bias is not supported") + sys.exit(1) + +# TODO: these seem to be layers that have been trained but without lora. +# doesn't seem widely used but eventually should be supported +if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: + print("Error: param modules_to_save is not supported") + sys.exit(1) + +with open(output_path, "wb") as fout: + fout.truncate() + + write_file_header(fout, params) + for k, v in model.items(): + if k.endswith("lora_A.weight"): + if v.dtype != torch.float16 and v.dtype != torch.float32: + v = v.float() + v = v.T + else: + v = v.float() + + t = v.numpy() + tname = translate_tensor_name(k) + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + write_tensor_header(fout, tname, t.shape, t.dtype) + t.tofile(fout) + +print(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/examples/common.cpp b/examples/common.cpp index 0772dbfe1..a0b6f10ad 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.model = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { @@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/examples/common.h b/examples/common.h index 1ea6f7445..cbbc2dfab 100644 --- a/examples/common.h +++ b/examples/common.h @@ -31,11 +31,12 @@ struct gpt_params { std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; - std::string input_prefix = ""; // string to prefix user inputs with - - + std::string input_prefix = ""; // string to prefix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted + std::string lora_adapter = ""; // lora adapter path + std::string lora_base = ""; // base model path for the lora adapter + bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3e4b0034e..b7b3c4196 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -114,6 +114,17 @@ int main(int argc, char ** argv) { } } + if (!params.lora_adapter.empty()) { + int err = llama_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + // print system information { fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 19449e16e..80792ea0d 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -134,6 +134,17 @@ int main(int argc, char ** argv) { } } + if (!params.lora_adapter.empty()) { + int err = llama_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + // print system information { fprintf(stderr, "\n"); diff --git a/ggml.c b/ggml.c index 995a2faab..acdba0333 100644 --- a/ggml.c +++ b/ggml.c @@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #endif } +static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q4_0] = { + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + }, + [GGML_TYPE_Q4_1] = { + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q_dot = quantize_row_q4_1, + .vec_dot_q = ggml_vec_dot_q4_1, + }, + // TODO: GGML_TYPE_Q8_0 +}; + +// For internal test use +quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { + GGML_ASSERT(i < GGML_TYPE_COUNT); + return quantize_fns[i]; +} + + // // simd mappings // @@ -5588,6 +5616,26 @@ static void ggml_compute_forward_dup_f16( } } } + } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + size_t id = 0; + uint8_t * dst_ptr = (uint8_t *) dst->data; + size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + float * src0_f32 = (float *) params->wdata; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + // convert to f32 and quantize + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += dst_row_size; + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -5780,6 +5828,21 @@ static void ggml_compute_forward_dup_f32( } } } + } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + size_t id = 0; + uint8_t * dst_ptr = (uint8_t *) dst->data; + size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += dst_row_size; + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -5968,6 +6031,212 @@ static void ggml_compute_forward_add_f32( } } +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(float)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr)); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + //const int64_t ne10 = src1->ne[0]; + //const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t ne0 = dst->ne[0]; + //const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + const enum ggml_type type = src0->type; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float*) params->wdata + ne00 * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne00); + } +} + static void ggml_compute_forward_add( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -5978,6 +6247,23 @@ static void ggml_compute_forward_add( { ggml_compute_forward_add_f32(params, src0, src1, dst); } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + { + ggml_compute_forward_add_q_f32(params, src0, src1, dst); + } break; default: { GGML_ASSERT(false); @@ -7257,30 +7543,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, - }, - [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, - }, - // TODO: GGML_TYPE_Q8_0 -}; - -// For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; -} - static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10137,13 +10399,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) struct ggml_tensor * node = cgraph->nodes[i]; switch (node->op) { + case GGML_OP_CPY: case GGML_OP_DUP: { node->n_tasks = 1; + + size_t cur = 0; + if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_ADD: { node->n_tasks = n_threads; + + size_t cur = 0; + + if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_SUB: case GGML_OP_MUL: @@ -10224,7 +10502,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; - case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: diff --git a/ggml.h b/ggml.h index e693754c9..59de0cb12 100644 --- a/ggml.h +++ b/ggml.h @@ -430,6 +430,12 @@ struct ggml_tensor * ggml_add( struct ggml_tensor * a, struct ggml_tensor * b); + +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + struct ggml_tensor * ggml_sub( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index cdd8bd16b..db71c03bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,6 +1,8 @@ // Defines fileno on msys: #ifndef _GNU_SOURCE #define _GNU_SOURCE +#include +#include #endif #include "llama_util.h" @@ -633,6 +635,7 @@ struct llama_model_loader { throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } + return get_tensor_for(lt); } @@ -1774,6 +1777,254 @@ int llama_model_quantize( } } +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + + auto & model = ctx->model; + + const int64_t t_start_lora_us = ggml_time_us(); + + auto fin = std::ifstream(path_lora, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora); + return 1; + } + + // verify magic and version + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 'ggla') { + fprintf(stderr, "%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: unsupported file version\n", __func__ ); + return 1; + } + } + + int32_t lora_r; + int32_t lora_alpha; + fin.read((char *) &lora_r, sizeof(lora_r)); + fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + float scaling = (float)lora_alpha / (float)lora_r; + + fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + std::unordered_map lora_tensors; + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + + // load base model + std::unique_ptr model_loader; + ggml_context * base_ctx = NULL; + llama_buffer base_buf; + if (path_base_model) { + fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + + size_t ctx_size, mmapped_size; + model_loader->calc_sizes(&ctx_size, &mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size; + base_params.mem_buffer = base_buf.addr; + base_params.no_alloc = model_loader->use_mmap; + + base_ctx = ggml_init(base_params); + + model_loader->ggml_ctx = base_ctx; + + // maybe this should in llama_model_loader + if (model_loader->use_mmap) { + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + if (fin.eof()) { + break; + } + + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + + std::string name(length, 0); + fin.read(&name[0], length); + + // check for lora suffix and get the type of tensor + const std::string lora_suffix = ".lora"; + size_t pos = name.rfind(lora_suffix); + if (pos == std::string::npos) { + fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + return 1; + } + + std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + + if (model_tensors.find(base_name.data()) == model_tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + fprintf(stderr, "%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor* lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); + + lora_tensors[name] = lora_tensor; + + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; + } + + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale(lora_ctx, BA, scale_tensor); + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) + fprintf(stderr, "."); + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); + return 1; + } +} + // Returns the KV cache that will contain the context for the // ongoing prediction with the model. const uint8_t * llama_get_kv_cache(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 192217593..c35193a8a 100644 --- a/llama.h +++ b/llama.h @@ -96,6 +96,18 @@ extern "C" { const char * fname_out, enum llama_ftype ftype); + // Apply a LoRA adapter to a loaded model + // path_base_model is the path to a higher quality model to use as a base for + // the layers modified by the adapter. Can be NULL to use the current loaded model. + // The model needs to be reloaded before applying a new adapter, otherwise the adapter + // will be applied on top of the previous one + // Returns 0 on success + LLAMA_API int llama_apply_lora_from_file( + struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + // Returns the KV cache that will contain the context for the // ongoing prediction with the model. LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx); diff --git a/llama_util.h b/llama_util.h index c92c0cc71..ee9b2a6f3 100755 --- a/llama_util.h +++ b/llama_util.h @@ -168,7 +168,7 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file) { + llama_mmap(struct llama_file * file, bool prefetch = true) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -180,10 +180,12 @@ struct llama_mmap { throw format("mmap failed: %s", strerror(errno)); } - // Advise the kernel to preload the mapped memory - if (madvise(addr, file->size, MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); + if (prefetch) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } } } @@ -193,7 +195,7 @@ struct llama_mmap { #elif defined(_WIN32) static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file) { + llama_mmap(struct llama_file * file, bool prefetch = true) { size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); @@ -215,13 +217,15 @@ struct llama_mmap { } #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); + if (prefetch) { + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } #else #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") From 4ad73137a1aadc40a62f7101aab883f69e7172c6 Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 17 Apr 2023 11:26:23 -0700 Subject: [PATCH 6/8] add 4_0 to default outfile namestr dict (#1031) this came up when trying to convert the gpt4all-lora-unfiltered-quantized.bin file --- convert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 7b9f043b2..7f7ae05fa 100644 --- a/convert.py +++ b/convert.py @@ -1085,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", }[params.file_type] @@ -1108,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)") + parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") From e9298af3896b536d0c6d740447dc764e56b22102 Mon Sep 17 00:00:00 2001 From: Atsushi Tatsuma Date: Tue, 18 Apr 2023 04:34:35 +0900 Subject: [PATCH 7/8] readme : add Ruby bindings (#1029) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 78215c9ce..4f451f32d 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) **UI:** From 42747220b4cac548b6e3059b66b3e960b517cfa4 Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Tue, 18 Apr 2023 03:15:50 +0200 Subject: [PATCH 8/8] Do not close file after mmap (Windows version) (#1034) --- llama_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_util.h b/llama_util.h index ee9b2a6f3..eba14656a 100755 --- a/llama_util.h +++ b/llama_util.h @@ -202,7 +202,6 @@ struct llama_mmap { HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); DWORD error = GetLastError(); - CloseHandle(hFile); if (hMapping == NULL) { throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());