Merge branch 'master' into concedo

# Conflicts: # .github/workflows/build.yml # CMakeLists.txt # Makefile
2023-04-22 16:22:08 +08:00 · 2023-04-22 16:22:08 +08:00 · 1b7aa2b815
commit 1b7aa2b815
parent 1ea0e15292 7e312f165c
3 changed files with 14 additions and 6 deletions
--- a/2
+++ b/2
@ -106,7 +106,7 @@ endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mcpu=power9
+		CFLAGS   += -mcpu=power9
 		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
--- a/ggml.c
+++ b/ggml.c
@ -470,6 +470,14 @@ static inline int hsum_i32_8(const __m256i a) {
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }

+// horizontally add 4 int32_t: returns the sum of the four 32-bit lanes of `a` as an int
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a); // upper 64 bits of a (lanes 2,3) copied into both halves
+    const __m128i sum64 = _mm_add_epi32(hi64, a); // lane 0 = a0+a2, lane 1 = a1+a3
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent 32-bit lanes so lane 0 holds the old lane 1
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); // lane 0 = a0+a1+a2+a3; extract the low 32 bits
+}
+
 #if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
@ -1383,7 +1391,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
        y[i].s1 = d * sum1;
    }
 #elif defined(__AVX2__) || defined(__AVX__)
-    // TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    for (int i = 0; i < nb; i++) {
        // Load elements into 4 AVX vectors
        __m256 v0 = _mm256_loadu_ps( x );
@ -1462,7 +1469,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
        // Compute the sum of the quants and set y[i].s
        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_8(_mm256_set_m128i(s1, s0));
+        y[i].s0 = d * hsum_i32_4(s0);
+        y[i].s1 = d * hsum_i32_4(s1);

        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 );
--- a/llama.cpp
+++ b/llama.cpp
@ -68,7 +68,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
        { MODEL_65B,   512ull * MB },
    };
    return _MEM_REQ_SCRATCH1;
-};
+}

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
        { MODEL_65B,  5120ull * MB },
    };
    return _MEM_REQ_KV_SELF;
-};
+}

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@ -93,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
        { MODEL_65B, 1536ull * MB },
    };
    return _MEM_REQ_EVAL;
-};
+}

 // default hparams (LLaMA 7B)
 struct llama_hparams {