From 872c365a9176a011b13d31269bb3121fa89c37e1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 22 Apr 2023 11:08:12 +0300 Subject: [PATCH 1/2] ggml : fix AVX build + update to new Q8_0 format --- Makefile | 10 +++++++--- ggml.c | 12 ++++++++++-- llama.cpp | 6 +++--- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 3b48eec99..b297959c9 100644 --- a/Makefile +++ b/Makefile @@ -74,13 +74,17 @@ endif # feel free to update the Makefile for your architecture and send a pull request or issue ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) # Use all CPU extensions that are available: - CFLAGS += -march=native -mtune=native + CFLAGS += -march=native -mtune=native CXXFLAGS += -march=native -mtune=native + + # Usage AVX-only + #CFLAGS += -mfma -mf16c -mavx + #CXXFLAGS += -mfma -mf16c -mavx endif ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) - CFLAGS += -mcpu=power9 + CFLAGS += -mcpu=power9 CXXFLAGS += -mcpu=power9 endif # Require c++23's std::byteswap for big-endian support. @@ -114,7 +118,7 @@ ifdef LLAMA_GPROF CXXFLAGS += -pg endif ifneq ($(filter aarch64%,$(UNAME_M)),) - CFLAGS += -mcpu=native + CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif ifneq ($(filter armv6%,$(UNAME_M)),) diff --git a/ggml.c b/ggml.c index 72b392fdb..46c0292fe 100644 --- a/ggml.c +++ b/ggml.c @@ -468,6 +468,14 @@ static inline int hsum_i32_8(const __m256i a) { return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + #if __AVX2__ || __AVX512F__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval @@ -1381,7 +1389,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].s1 = d * sum1; } #elif defined(__AVX2__) || defined(__AVX__) - // TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1460,7 +1467,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = d * hsum_i32_8(_mm256_set_m128i(s1, s0)); + y[i].s0 = d * hsum_i32_4(s0); + y[i].s1 = d * hsum_i32_4(s1); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); diff --git a/llama.cpp b/llama.cpp index 00cce6e2a..4e92f5515 100644 --- a/llama.cpp +++ b/llama.cpp @@ -68,7 +68,7 @@ static const std::map & MEM_REQ_SCRATCH1() { MODEL_65B, 512ull * MB }, }; return _MEM_REQ_SCRATCH1; -}; +} // 2*n_embd*n_ctx*n_layer*sizeof(float16) static const std::map & MEM_REQ_KV_SELF() @@ -80,7 +80,7 @@ static const std::map & MEM_REQ_KV_SELF() { MODEL_65B, 5120ull * MB }, }; return _MEM_REQ_KV_SELF; -}; +} // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled @@ -93,7 +93,7 @@ static const std::map & MEM_REQ_EVAL() { MODEL_65B, 1536ull * MB }, }; return _MEM_REQ_EVAL; -}; +} // default hparams (LLaMA 7B) struct llama_hparams { From 7e312f165c5047d6e16680d1eebc83055e95c313 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sat, 22 Apr 2023 16:18:20 +0800 Subject: [PATCH 2/2] cmake : fix build under Windows when enable BUILD_SHARED_LIBS (#1100) * Fix build under Windows when enable BUILD_SHARED_LIBS * Make AVX512 test on Windows to build the shared libs --- .github/workflows/build.yml | 2 +- CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7e8a29b1e..b2a35613e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -169,7 +169,7 @@ jobs: - build: 'avx' defines: '-DLLAMA_AVX2=OFF' - build: 'avx512' - defines: '-DLLAMA_AVX512=ON' + defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c3c60167..2d4e30e5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,10 @@ endif() if (MSVC) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() endif() if (LLAMA_LTO)