diff --git a/Makefile b/Makefile
index 3e58a28a7..17624656b 100644
--- a/Makefile
+++ b/Makefile
@@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
+ggml_extra.o: ggml_extra.cpp ggml_extra.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+libllama.so: llama.o ggml.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 #
 # Tests
 #
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 680757c6b..313b7534f 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = 2 - q4_0\n");
         fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 4 - new q4_0\n");
+        fprintf(stderr, "  type = 5 - new q4_1\n");
         return 1;
     }
 
diff --git a/ggml_extra.cpp b/ggml_extra.cpp
index cabbefae7..e2ae005df 100644
--- a/ggml_extra.cpp
+++ b/ggml_extra.cpp
@@ -10,6 +10,11 @@
 
 namespace {
 
+constexpr int kChunkSize = 32*32*8;
+constexpr int QK = 32;
+constexpr int kBucketSize0 = QK/2 + sizeof(float);
+constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+
 inline int toNearestInt(float fval) {
     assert(fval <= 4194303.f);
     constexpr float kSnapper=3<<22;
@@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
     return {a, b};
 }
 
-void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
-    constexpr int kChunkSize = 32*32*8;
-    constexpr int QK = 32;
-    constexpr int kBucketSize0 = QK/2 + sizeof(float);
-    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
     assert(k % QK == 0);
     auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float,int>>& work, std::vector<float>& tmpX) {
+        auto q = (uint8_t*)y;
         if (type == 0) {
             float scale = kQuantize0(QK, X, L, work, -7, 7);
-            std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
             for (int k=0; k
> work;
     std::vector<float> tmpX;
     int nb = k / QK;
+    auto x = X;
     for (int i=0; i
 counter(0);
-    auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
+    auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
         std::vector<int8_t> L(QK);
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         while (true) {
-            int first = counter.fetch_add(kChunkSize);
+            int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
             if (first >= k) break;
             int last = first + kChunkSize;
             if (last > k) last = k;
-            auto xi = x + first;
+            auto xi = X + first;
             auto yi = y + (first/QK)*bucketSize;
             int n = (last - first)/QK;
             for (int i=0; i
> 4];
+        }
+        y += 16;
+    }
+}
+
 }
 
 extern "C" {
@@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
     kQuantizeQ4(x, buffer, k, 1);
 }
 
+size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 0);
+    collectHisto(k, buffer, hist, 0);
+    return (k / QK) * kBucketSize0;
+}
+
+size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 1);
+    collectHisto(k, buffer, hist, 1);
+    return (k / QK) * kBucketSize1;
+}
+
 }
diff --git a/ggml_extra.h b/ggml_extra.h
index 99041bed0..788fcd0ea 100644
--- a/ggml_extra.h
+++ b/ggml_extra.h
@@ -1,7 +1,12 @@
 #pragma once
 
 #ifdef __cplusplus
+#include <cstdint>
+#include <cstddef>
 extern "C" {
+#else
+#include <stdint.h>
+#include <stddef.h>
 #endif
 
 #ifdef __cplusplus
@@ -12,8 +17,10 @@ extern "C" {
 #endif
 
 void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 
 void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 
 #ifdef __cplusplus
 }
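The new kQuantizeQ4_0H/kQuantizeQ4_1H entry points behave like the existing kQuantizeQ4_0/kQuantizeQ4_1, but they additionally fill a histogram of the emitted 4-bit codes (16 bins, matching the hist_cur buffer llama.cpp passes in) and return the number of bytes written, (k/QK)*kBucketSize0 for q4_0 and (k/QK)*kBucketSize1 for q4_1. As a reference for the bookkeeping collectHisto performs, here is a minimal sketch that counts low and high nibbles per block under the layout used above: one float scale for q4_0, or scale plus min for q4_1, followed by QK/2 = 16 code bytes. The helper name count_nibbles and the loop variable names are illustrative, not taken from the patch.

// Illustrative sketch (not part of the patch) of per-block nibble counting,
// assuming the q4_0/q4_1 block layout from ggml_extra.cpp:
//   q4_0: 1 float scale              + 16 bytes of codes (kBucketSize0 = 20)
//   q4_1: 1 float scale + 1 float min + 16 bytes of codes (kBucketSize1 = 24)
#include <cstdint>

static void count_nibbles(int k, const void* buffer, int64_t* hist, int type) {
    constexpr int QK = 32;
    const int header = (type == 0 ? 1 : 2) * (int) sizeof(float);
    auto y = (const uint8_t*) buffer;
    for (int i = 0; i < k / QK; ++i) {
        y += header;                        // skip the float header of this block
        for (int l = 0; l < QK / 2; ++l) {  // 16 bytes hold 32 4-bit codes
            ++hist[y[l] & 15];              // low nibble
            ++hist[y[l] >> 4];              // high nibble
        }
        y += QK / 2;                        // advance past the code bytes
    }
}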
diff --git a/llama.cpp b/llama.cpp
index 54ba01eef..04ba10672 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8,6 +8,7 @@
 #include "llama_internal.h"
 
 #include "ggml.h"
+#include "ggml_extra.h"
 
 #include 
 #include 
@@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
     ggml_type quantized_type;
+    bool useNewQuantization = false;
     switch (itype) {
         case 2: quantized_type = GGML_TYPE_Q4_0; break;
         case 3: quantized_type = GGML_TYPE_Q4_1; break;
+        case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
+        case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
         default: throw format("invalid quantization type %d\n", itype);
     };
@@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             switch (new_type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                        new_size = useNewQuantization ?
+                            kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
+                            ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                        new_size = useNewQuantization ?
+                            kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
+                            ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
                 default:
                     LLAMA_ASSERT(false);
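With the dispatch above, running the quantize tool with type 4 or 5 routes llama_model_quantize_internal through the new entry points instead of ggml_quantize_q4_0/ggml_quantize_q4_1. The standalone sketch below calls kQuantizeQ4_0H directly; it assumes ggml_extra.h is on the include path and ggml_extra.o is linked in (as the Makefile changes arrange for the bundled tools), and the input data and sizes are made up for illustration.

// Standalone usage sketch (not from the patch): quantize k floats with the new
// q4_0 scheme and print the histogram of 4-bit codes, mirroring what
// llama_model_quantize_internal now does for itype == 4.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml_extra.h"

int main() {
    const int k = 32 * 1024;                   // must be a multiple of QK = 32
    std::vector<float> src(k);
    for (int i = 0; i < k; ++i) src[i] = std::sin(0.01f * i);   // dummy data

    std::vector<uint8_t> dst((k / 32) * 20);   // kBucketSize0 = 20 bytes per block
    int64_t hist[16] = {0};                    // one bin per 4-bit code

    size_t written = kQuantizeQ4_0H(src.data(), dst.data(), k, hist);
    printf("wrote %zu bytes\n", written);
    for (int i = 0; i < 16; ++i) {
        printf("code %2d: %lld\n", i, (long long) hist[i]);
    }
    return 0;
}

The destination buffer is sized from kBucketSize0 (20 bytes for every 32 input floats); for the q4_1 path it would be kBucketSize1, i.e. 24 bytes per block.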