Add new quantization to quantize

Iwan Kawrakow 2023-04-11 15:32:41 +02:00
parent 92408cd983
commit 709d23543a
5 changed files with 76 additions and 28 deletions

Makefile

@@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+ggml_extra.o: ggml_extra.cpp ggml_extra.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+libllama.so: llama.o ggml.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 #
 # Tests
 #

examples/quantize/quantize.cpp

@@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, " type = 2 - q4_0\n");
         fprintf(stderr, " type = 3 - q4_1\n");
+        fprintf(stderr, " type = 4 - new q4_0\n");
+        fprintf(stderr, " type = 5 - new q4_1\n");
         return 1;
     }
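The two new codes pair with the switch added to llama_model_quantize_internal further down: types 4 and 5 keep the on-disk q4_0/q4_1 tensor types and only swap the quantization routine. A minimal sketch of that mapping (QuantChoice and choose are illustrative names, not from the commit):

    #include <stdexcept>

    // Illustrative mapping from the command-line type code to
    // (storage layout, quantizer). Types 4/5 reuse the existing
    // q4_0/q4_1 layouts; only the choice of scales and codes changes.
    struct QuantChoice { bool is_q4_1; bool use_new_quantizer; };

    QuantChoice choose(int itype) {
        switch (itype) {
            case 2: return {false, false}; // q4_0, existing ggml quantizer
            case 3: return {true,  false}; // q4_1, existing ggml quantizer
            case 4: return {false, true};  // q4_0 layout, new quantizer
            case 5: return {true,  true};  // q4_1 layout, new quantizer
            default: throw std::runtime_error("invalid quantization type");
        }
    }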

ggml_extra.cpp

@@ -10,6 +10,11 @@
 namespace {
+constexpr int kChunkSize = 32*32*8;
+constexpr int QK = 32;
+constexpr int kBucketSize0 = QK/2 + sizeof(float);
+constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
 inline int toNearestInt(float fval) {
     assert(fval <= 4194303.f);
     constexpr float kSnapper=3<<22;
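The rest of toNearestInt is cut off by the hunk, but kSnapper is the classic magic-number rounding trick: for |x| below 2^22, adding 3*2^22 = 1.5*2^23 moves the value into a binade where float spacing is exactly 1, so the FPU's round-to-nearest mode does the rounding and the result can be read from the low mantissa bits. A self-contained sketch of the idea, with the standard IEEE-754 single-precision masks (the body below is an illustration, not necessarily the file's exact code):

    #include <cassert>
    #include <cstring>

    // Magic-number rounding: adding 3*2^22 to a float of magnitude < 2^22
    // leaves round-to-nearest(x) + 2^22 in the low 23 mantissa bits.
    inline int to_nearest_int(float fval) {
        assert(fval <= 4194303.f);           // must stay below 2^22
        constexpr float kSnapper = 3 << 22;  // 12582912 = 1.5 * 2^23
        float val = fval + kSnapper;
        int i;
        std::memcpy(&i, &val, sizeof(int));  // well-defined type pun
        return (i & 0x007fffff) - 0x00400000;
    }
    // e.g. to_nearest_int(-3.4f) == -3; to_nearest_int(2.5f) == 2 (ties to even)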
@@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
     return {a, b};
 }
-void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
-    constexpr int kChunkSize = 32*32*8;
-    constexpr int QK = 32;
-    constexpr int kBucketSize0 = QK/2 + sizeof(float);
-    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
     assert(k % QK == 0);
     auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
+        auto q = (uint8_t*)y;
         if (type == 0) {
             float scale = kQuantize0(QK, X, L, work, -7, 7);
-            std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
             for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
         } else {
             auto result = kQuantize1(QK, X, L, tmpX, work, 7);
-            std::memcpy(y, &result.second, sizeof(result.second)); y += sizeof(result.second);
-            std::memcpy(y, &result.first, sizeof(result.first)); y += sizeof(result.first);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
+            std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
             for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
         }
     };
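After this change processOne writes each 32-weight block as the scale float(s) followed by 16 bytes carrying 32 four-bit codes, low nibble first; in the type 0 branch the codes from kQuantize0 lie in [-7, 7] and get a +8 bias before packing. A round-trip sketch of that nibble packing (pack2/unpack2 are illustrative helpers, not names from this file):

    #include <cassert>
    #include <cstdint>

    // Two 4-bit codes per byte, low nibble first, +8 bias as in the type 0 path.
    inline uint8_t pack2(int8_t lo, int8_t hi) {
        return uint8_t(lo + 8) | (uint8_t(hi + 8) << 4);
    }
    inline void unpack2(uint8_t q, int8_t& lo, int8_t& hi) {
        lo = int8_t(q & 15) - 8;
        hi = int8_t(q >> 4) - 8;
    }

    int main() {
        int8_t lo, hi;
        unpack2(pack2(-7, 5), lo, hi);
        assert(lo == -7 && hi == 5);  // codes in [-7, 7] round-trip exactly
    }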
@@ -156,24 +156,25 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         int nb = k / QK;
+        auto x = X;
         for (int i=0; i<nb; ++i) {
-            processOne(x + QK*i, L.data(), y, work, tmpX);
-            y += bucketSize;
+            processOne(x, L.data(), y, work, tmpX);
+            y += bucketSize; x += QK;
         }
         return;
     }
     std::atomic<int> counter(0);
-    auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
+    auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
         std::vector<int8_t> L(QK);
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         while (true) {
-            int first = counter.fetch_add(kChunkSize);
+            int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
             if (first >= k) break;
             int last = first + kChunkSize;
             if (last > k) last = k;
-            auto xi = x + first;
+            auto xi = X + first;
             auto yi = y + (first/QK)*bucketSize;
             int n = (last - first)/QK;
             for (int i=0; i<n; ++i) {
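Switching to fetch_add(kChunkSize, std::memory_order_relaxed) is sound here: the atomic counter only hands out disjoint index ranges, no other data is synchronized through it, and the join() at the end of kQuantizeQ4 publishes every worker's writes. The same work-distribution pattern in isolation (parallel_for is a hypothetical name; a sketch, not code from this commit):

    #include <atomic>
    #include <thread>
    #include <vector>

    // Each worker claims [first, first+chunk) ranges from a shared counter
    // until the input is exhausted. Relaxed ordering suffices because the
    // counter only partitions the index space; join() publishes the results.
    void parallel_for(int n, int chunk, int nthread, void (*fn)(int, int)) {
        std::atomic<int> counter(0);
        auto worker = [&]() {
            while (true) {
                int first = counter.fetch_add(chunk, std::memory_order_relaxed);
                if (first >= n) break;
                int last = first + chunk < n ? first + chunk : n;
                fn(first, last);
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread; ++i) workers.emplace_back(worker);
        for (auto& w : workers) w.join();
    }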
@@ -189,6 +190,21 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
     for (auto& w : workers) w.join();
 }
+void collectHisto(int k, const void* buffer, int64_t* hist, int type) {
+    if (!hist) return;
+    auto y = (const uint8_t*)buffer;
+    int m = type == 0 ? 4 : 8;
+    int n = k / 32;
+    for (int i=0; i<n; ++i) {
+        y += m;
+        for (int l=0; l<16; ++l) {
+            ++hist[y[l] & 15];
+            ++hist[y[l] >> 4];
+        }
+        y += 16;
+    }
+}
 }
 extern "C" {
@@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
     kQuantizeQ4(x, buffer, k, 1);
 }
+size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 0);
+    collectHisto(k, buffer, hist, 0);
+    return (k / QK) * kBucketSize0;
+}
+size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 1);
+    collectHisto(k, buffer, hist, 1);
+    return (k / QK) * kBucketSize1;
+}
 }
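The sizes returned by kQuantizeQ4_0H/kQuantizeQ4_1H follow from the constants at the top of the file and match ggml's existing q4_0/q4_1 block sizes, so a model quantized with the new types occupies exactly as many bytes as with the old ones. The arithmetic as compile-time checks (constants duplicated so the snippet stands alone):

    // 32 weights -> 16 nibble bytes plus the per-block scale(s).
    constexpr int QK = 32;
    constexpr int kBucketSize0 = QK/2 + sizeof(float);    // 20 bytes
    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);  // 24 bytes
    static_assert(kBucketSize0 == 20 && kBucketSize1 == 24, "block sizes");
    static_assert(8.0 * kBucketSize0 / QK == 5.0, "q4_0: 5.0 bits per weight");
    static_assert(8.0 * kBucketSize1 / QK == 6.0, "q4_1: 6.0 bits per weight");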

ggml_extra.h

@@ -1,7 +1,12 @@
 #pragma once
 #ifdef __cplusplus
+#include <cstdint>
+#include <cstddef>
 extern "C" {
+#else
+#include <stdint.h>
+#include <stddef.h>
 #endif
 #ifdef __cplusplus
@@ -12,8 +17,10 @@ extern "C" {
 #endif
 void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 #ifdef __cplusplus
 }
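A usage sketch for the two new histogram-reporting entry points, called from C++ (quantize_example and its buffer sizing are illustrative; the input length must be a multiple of 32):

    #include <cstdint>
    #include <vector>
    #include "ggml_extra.h"

    void quantize_example(const std::vector<float>& x) {
        std::vector<uint8_t> buf((x.size()/32) * 20);  // q4_0: 20 bytes per block
        int64_t hist[16] = {};                         // one bin per 4-bit code
        size_t written = kQuantizeQ4_0H(x.data(), buf.data(), (int)x.size(), hist);
        // 'written' equals buf.size(); hist now holds the nibble distribution.
        (void)written;
    }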

llama.cpp

@@ -8,6 +8,7 @@
 #include "llama_internal.h"
 #include "ggml.h"
+#include "ggml_extra.h"
 #include <array>
 #include <cinttypes>
@@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
     ggml_type quantized_type;
+    bool useNewQuantization = false;
     switch (itype) {
         case 2: quantized_type = GGML_TYPE_Q4_0; break;
         case 3: quantized_type = GGML_TYPE_Q4_1; break;
+        case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
+        case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
         default: throw format("invalid quantization type %d\n", itype);
     };
@@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (new_type) {
         case GGML_TYPE_Q4_0:
             {
-                new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                new_size = useNewQuantization ?
+                    kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
+                    ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
             } break;
         case GGML_TYPE_Q4_1:
             {
-                new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                new_size = useNewQuantization ?
+                    kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
+                    ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
             } break;
         default:
             LLAMA_ASSERT(false);
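Since the new quantizers keep the q4_0/q4_1 block layout, new_size here is identical whichever path runs; only the scales and 4-bit codes inside the blocks differ. A worked check for a 4096 x 4096 tensor:

    #include <cstdio>

    // 4096 x 4096 = 16,777,216 elements -> 524,288 blocks of 32 weights,
    // 20 bytes each on the q4_0 path (kBucketSize0).
    int main() {
        long long nelements = 4096LL * 4096LL;
        long long new_size  = (nelements / 32) * 20;
        printf("%lld bytes (%.2f bits/weight)\n", new_size,
               8.0 * new_size / nelements);  // prints 10485760 bytes (5.00)
    }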