Add new quantization to quantize

Iwan Kawrakow 2023-04-11 15:32:41 +02:00
parent 92408cd983
commit 709d23543a
5 changed files with 76 additions and 28 deletions

Makefile

@@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+ggml_extra.o: ggml_extra.cpp ggml_extra.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+libllama.so: llama.o ggml.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 #
 # Tests
 #

examples/quantize/quantize.cpp

@@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, " type = 2 - q4_0\n");
         fprintf(stderr, " type = 3 - q4_1\n");
+        fprintf(stderr, " type = 4 - new q4_0\n");
+        fprintf(stderr, " type = 5 - new q4_1\n");
         return 1;
     }
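The two new codes pair with the switch added to llama_model_quantize_internal further down: types 4 and 5 keep the on-disk q4_0/q4_1 tensor types and only swap the quantization routine. A minimal sketch of that mapping (QuantChoice and choose are illustrative names, not from the commit):

    #include <stdexcept>

    // Illustrative mapping from the command-line type code to
    // (storage layout, quantizer). Types 4/5 reuse the existing
    // q4_0/q4_1 layouts; only the choice of scales and codes changes.
    struct QuantChoice { bool is_q4_1; bool use_new_quantizer; };

    QuantChoice choose(int itype) {
        switch (itype) {
            case 2: return {false, false}; // q4_0, existing ggml quantizer
            case 3: return {true,  false}; // q4_1, existing ggml quantizer
            case 4: return {false, true};  // q4_0 layout, new quantizer
            case 5: return {true,  true};  // q4_1 layout, new quantizer
            default: throw std::runtime_error("invalid quantization type");
        }
    }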

ggml_extra.cpp

@@ -10,6 +10,11 @@
 namespace {
+constexpr int kChunkSize = 32*32*8;
+constexpr int QK = 32;
+constexpr int kBucketSize0 = QK/2 + sizeof(float);
+constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
 inline int toNearestInt(float fval) {
     assert(fval <= 4194303.f);
     constexpr float kSnapper=3<<22;
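The rest of toNearestInt is cut off by the hunk, but kSnapper is the classic magic-number rounding trick: for |x| below 2^22, adding 3*2^22 = 1.5*2^23 moves the value into a binade where float spacing is exactly 1, so the FPU's round-to-nearest mode does the rounding and the result can be read from the low mantissa bits. A self-contained sketch of the idea, with the standard IEEE-754 single-precision masks (the body below is an illustration, not necessarily the file's exact code):

    #include <cassert>
    #include <cstring>

    // Magic-number rounding: adding 3*2^22 to a float of magnitude < 2^22
    // leaves round-to-nearest(x) + 2^22 in the low 23 mantissa bits.
    inline int to_nearest_int(float fval) {
        assert(fval <= 4194303.f);           // must stay below 2^22
        constexpr float kSnapper = 3 << 22;  // 12582912 = 1.5 * 2^23
        float val = fval + kSnapper;
        int i;
        std::memcpy(&i, &val, sizeof(int));  // well-defined type pun
        return (i & 0x007fffff) - 0x00400000;
    }
    // e.g. to_nearest_int(-3.4f) == -3; to_nearest_int(2.5f) == 2 (ties to even)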
@@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
     return {a, b};
 }
-void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
-    constexpr int kChunkSize = 32*32*8;
-    constexpr int QK = 32;
-    constexpr int kBucketSize0 = QK/2 + sizeof(float);
-    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
     assert(k % QK == 0);
     auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
+        auto q = (uint8_t*)y;
         if (type == 0) {
             float scale = kQuantize0(QK, X, L, work, -7, 7);
-            std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
             for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
         } else {
             auto result = kQuantize1(QK, X, L, tmpX, work, 7);
-            std::memcpy(y, &result.second, sizeof(result.second)); y += sizeof(result.second);
-            std::memcpy(y, &result.first, sizeof(result.first)); y += sizeof(result.first);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
+            std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
             for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
         }
     };
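After this change processOne writes each 32-weight block as the scale float(s) followed by 16 bytes carrying 32 four-bit codes, low nibble first; in the type 0 branch the codes from kQuantize0 lie in [-7, 7] and get a +8 bias before packing. A round-trip sketch of that nibble packing (pack2/unpack2 are illustrative helpers, not names from this file):

    #include <cassert>
    #include <cstdint>

    // Two 4-bit codes per byte, low nibble first, +8 bias as in the type 0 path.
    inline uint8_t pack2(int8_t lo, int8_t hi) {
        return uint8_t(lo + 8) | (uint8_t(hi + 8) << 4);
    }
    inline void unpack2(uint8_t q, int8_t& lo, int8_t& hi) {
        lo = int8_t(q & 15) - 8;
        hi = int8_t(q >> 4) - 8;
    }

    int main() {
        int8_t lo, hi;
        unpack2(pack2(-7, 5), lo, hi);
        assert(lo == -7 && hi == 5);  // codes in [-7, 7] round-trip exactly
    }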
@@ -156,24 +156,25 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         int nb = k / QK;
+        auto x = X;
         for (int i=0; i<nb; ++i) {
-            processOne(x + QK*i, L.data(), y, work, tmpX);
-            y += bucketSize;
+            processOne(x, L.data(), y, work, tmpX);
+            y += bucketSize; x += QK;
         }
         return;
     }
     std::atomic<int> counter(0);
-    auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
+    auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
         std::vector<int8_t> L(QK);
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         while (true) {
-            int first = counter.fetch_add(kChunkSize);
+            int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
             if (first >= k) break;
             int last = first + kChunkSize;
             if (last > k) last = k;
-            auto xi = x + first;
+            auto xi = X + first;
             auto yi = y + (first/QK)*bucketSize;
             int n = (last - first)/QK;
             for (int i=0; i<n; ++i) {
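Switching to fetch_add(kChunkSize, std::memory_order_relaxed) is sound here: the atomic counter only hands out disjoint index ranges, no other data is synchronized through it, and the join() at the end of kQuantizeQ4 publishes every worker's writes. The same work-distribution pattern in isolation (parallel_for is a hypothetical name; a sketch, not code from this commit):

    #include <atomic>
    #include <thread>
    #include <vector>

    // Each worker claims [first, first+chunk) ranges from a shared counter
    // until the input is exhausted. Relaxed ordering suffices because the
    // counter only partitions the index space; join() publishes the results.
    void parallel_for(int n, int chunk, int nthread, void (*fn)(int, int)) {
        std::atomic<int> counter(0);
        auto worker = [&]() {
            while (true) {
                int first = counter.fetch_add(chunk, std::memory_order_relaxed);
                if (first >= n) break;
                int last = first + chunk < n ? first + chunk : n;
                fn(first, last);
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread; ++i) workers.emplace_back(worker);
        for (auto& w : workers) w.join();
    }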
@@ -189,6 +190,21 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
     for (auto& w : workers) w.join();
 }
+void collectHisto(int k, const void* buffer, int64_t* hist, int type) {
+    if (!hist) return;
+    auto y = (const uint8_t*)buffer;
+    int m = type == 0 ? 4 : 8;
+    int n = k / 32;
+    for (int i=0; i<n; ++i) {
+        y += m;
+        for (int l=0; l<16; ++l) {
+            ++hist[y[l] & 15];
+            ++hist[y[l] >> 4];
+        }
+        y += 16;
+    }
+}
 }
 extern "C" {
@@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
     kQuantizeQ4(x, buffer, k, 1);
 }
+size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 0);
+    collectHisto(k, buffer, hist, 0);
+    return (k / QK) * kBucketSize0;
+}
+size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 1);
+    collectHisto(k, buffer, hist, 1);
+    return (k / QK) * kBucketSize1;
+}
 }
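The sizes returned by kQuantizeQ4_0H/kQuantizeQ4_1H follow from the constants at the top of the file and match ggml's existing q4_0/q4_1 block sizes, so a model quantized with the new types occupies exactly as many bytes as with the old ones. The arithmetic as compile-time checks (constants duplicated so the snippet stands alone):

    // 32 weights -> 16 nibble bytes plus the per-block scale(s).
    constexpr int QK = 32;
    constexpr int kBucketSize0 = QK/2 + sizeof(float);    // 20 bytes
    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);  // 24 bytes
    static_assert(kBucketSize0 == 20 && kBucketSize1 == 24, "block sizes");
    static_assert(8.0 * kBucketSize0 / QK == 5.0, "q4_0: 5.0 bits per weight");
    static_assert(8.0 * kBucketSize1 / QK == 6.0, "q4_1: 6.0 bits per weight");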

ggml_extra.h

@@ -1,7 +1,12 @@
 #pragma once
 #ifdef __cplusplus
+#include <cstdint>
+#include <cstddef>
 extern "C" {
+#else
+#include <stdint.h>
+#include <stddef.h>
 #endif
 #ifdef __cplusplus
@@ -12,8 +17,10 @@ extern "C" {
 #endif
 void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 #ifdef __cplusplus
 }
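A usage sketch for the two new histogram-reporting entry points, called from C++ (quantize_example and its buffer sizing are illustrative; the input length must be a multiple of 32):

    #include <cstdint>
    #include <vector>
    #include "ggml_extra.h"

    void quantize_example(const std::vector<float>& x) {
        std::vector<uint8_t> buf((x.size()/32) * 20);  // q4_0: 20 bytes per block
        int64_t hist[16] = {};                         // one bin per 4-bit code
        size_t written = kQuantizeQ4_0H(x.data(), buf.data(), (int)x.size(), hist);
        // 'written' equals buf.size(); hist now holds the nibble distribution.
        (void)written;
    }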

llama.cpp

@@ -8,6 +8,7 @@
 #include "llama_internal.h"
 #include "ggml.h"
+#include "ggml_extra.h"
 #include <array>
 #include <cinttypes>
@@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
     ggml_type quantized_type;
+    bool useNewQuantization = false;
     switch (itype) {
         case 2: quantized_type = GGML_TYPE_Q4_0; break;
         case 3: quantized_type = GGML_TYPE_Q4_1; break;
+        case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
+        case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
         default: throw format("invalid quantization type %d\n", itype);
     };
@@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (new_type) {
         case GGML_TYPE_Q4_0:
             {
-                new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                new_size = useNewQuantization ?
+                    kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
+                    ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
             } break;
         case GGML_TYPE_Q4_1:
             {
-                new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                new_size = useNewQuantization ?
+                    kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
+                    ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
             } break;
         default:
             LLAMA_ASSERT(false);
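Since the new quantizers keep the q4_0/q4_1 block layout, new_size here is identical whichever path runs; only the scales and 4-bit codes inside the blocks differ. A worked check for a 4096 x 4096 tensor:

    #include <cstdio>

    // 4096 x 4096 = 16,777,216 elements -> 524,288 blocks of 32 weights,
    // 20 bytes each on the q4_0 path (kBucketSize0).
    int main() {
        long long nelements = 4096LL * 4096LL;
        long long new_size  = (nelements / 32) * 20;
        printf("%lld bytes (%.2f bits/weight)\n", new_size,
               8.0 * new_size / nelements);  // prints 10485760 bytes (5.00)
    }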