Add new quantization to quantize
This commit is contained in:
parent
92408cd983
commit
709d23543a
5 changed files with 76 additions and 28 deletions
27
Makefile
27
Makefile
|
@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
|
|||
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
|
||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||
|
||||
ggml_extra.o: ggml_extra.cpp ggml_extra.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
libllama.so: llama.o ggml.o ggml_extra.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
|
|
@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
|
|||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
fprintf(stderr, " type = 2 - q4_0\n");
|
||||
fprintf(stderr, " type = 3 - q4_1\n");
|
||||
fprintf(stderr, " type = 4 - new q4_0\n");
|
||||
fprintf(stderr, " type = 5 - new q4_1\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,6 +10,11 @@
|
|||
|
||||
namespace {
|
||||
|
||||
constexpr int kChunkSize = 32*32*8;
|
||||
constexpr int QK = 32;
|
||||
constexpr int kBucketSize0 = QK/2 + sizeof(float);
|
||||
constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
|
||||
|
||||
inline int toNearestInt(float fval) {
|
||||
assert(fval <= 4194303.f);
|
||||
constexpr float kSnapper=3<<22;
|
||||
|
@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
|
|||
return {a, b};
|
||||
}
|
||||
|
||||
void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
|
||||
constexpr int kChunkSize = 32*32*8;
|
||||
constexpr int QK = 32;
|
||||
constexpr int kBucketSize0 = QK/2 + sizeof(float);
|
||||
constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
|
||||
void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
|
||||
auto q = (uint8_t*)y;
|
||||
if (type == 0) {
|
||||
float scale = kQuantize0(QK, X, L, work, -7, 7);
|
||||
std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
|
||||
uint8_t* q = (uint8_t*)y;
|
||||
std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
|
||||
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
|
||||
} else {
|
||||
auto result = kQuantize1(QK, X, L, tmpX, work, 7);
|
||||
std::memcpy(y, &result.second, sizeof(result.second)); y += sizeof(result.second);
|
||||
std::memcpy(y, &result.first, sizeof(result.first)); y += sizeof(result.first);
|
||||
uint8_t* q = (uint8_t*)y;
|
||||
std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
|
||||
std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
|
||||
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
||||
}
|
||||
};
|
||||
|
@ -156,24 +156,25 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
|
|||
std::vector<std::pair<float,int>> work;
|
||||
std::vector<float> tmpX;
|
||||
int nb = k / QK;
|
||||
auto x = X;
|
||||
for (int i=0; i<nb; ++i) {
|
||||
processOne(x + QK*i, L.data(), y, work, tmpX);
|
||||
processOne(x, L.data(), y, work, tmpX);
|
||||
y += bucketSize; x += QK;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
std::atomic<int> counter(0);
|
||||
auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
|
||||
auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
|
||||
std::vector<int8_t> L(QK);
|
||||
std::vector<std::pair<float,int>> work;
|
||||
std::vector<float> tmpX;
|
||||
while (true) {
|
||||
int first = counter.fetch_add(kChunkSize);
|
||||
int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
|
||||
if (first >= k) break;
|
||||
int last = first + kChunkSize;
|
||||
if (last > k) last = k;
|
||||
auto xi = x + first;
|
||||
auto xi = X + first;
|
||||
auto yi = y + (first/QK)*bucketSize;
|
||||
int n = (last - first)/QK;
|
||||
for (int i=0; i<n; ++i) {
|
||||
|
@ -189,6 +190,21 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
|
|||
for (auto& w : workers) w.join();
|
||||
}
|
||||
|
||||
/**
 * Adds the 4-bit values stored in a packed Q4 buffer to a histogram.
 *
 * The buffer is read as k/32 consecutive blocks. Each block begins with a
 * header that is skipped (4 bytes when type == 0, 8 bytes otherwise) and is
 * followed by 16 payload bytes. Every payload byte contributes two counts:
 * one for its low nibble and one for its high nibble.
 *
 * @param k      Number of quantized values; only whole groups of 32 are
 *               counted, so nothing happens when k < 32.
 * @param buffer Packed blocks to scan. Nothing is counted when this is null.
 * @param hist   Array of at least 16 counters, incremented in place (it is
 *               not cleared here). Nothing is counted when this is null.
 * @param type   0 selects the 4-byte header; any other value selects the
 *               8-byte header.
 */
void collectHisto(int k, const void* buffer, int64_t* hist, int type) {
    // Either pointer being null means there is nothing to read or nowhere to count.
    if (!hist || !buffer) return;

    constexpr int kValuesPerBlock = 32;
    constexpr int kPackedBytes = kValuesPerBlock / 2;  // two 4-bit values per byte
    const int headerBytes = type == 0 ? 4 : 8;
    const int numBlocks = k / kValuesPerBlock;

    const uint8_t* block = static_cast<const uint8_t*>(buffer);
    for (int i = 0; i < numBlocks; ++i) {
        const uint8_t* packed = block + headerBytes;  // step over the block header
        for (int j = 0; j < kPackedBytes; ++j) {
            ++hist[packed[j] & 15];
            ++hist[packed[j] >> 4];
        }
        block = packed + kPackedBytes;
    }
}
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
|
|||
kQuantizeQ4(x, buffer, k, 1);
|
||||
}
|
||||
|
||||
size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
|
||||
kQuantizeQ4(x, buffer, k, 0);
|
||||
collectHisto(k, buffer, hist, 0);
|
||||
return (k / QK) * kBucketSize0;
|
||||
}
|
||||
|
||||
size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
|
||||
kQuantizeQ4(x, buffer, k, 1);
|
||||
collectHisto(k, buffer, hist, 1);
|
||||
return (k / QK) * kBucketSize1;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,7 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
extern "C" {
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -12,8 +17,10 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
||||
|
||||
void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
12
llama.cpp
12
llama.cpp
|
@ -8,6 +8,7 @@
|
|||
#include "llama_internal.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml_extra.h"
|
||||
|
||||
#include <array>
|
||||
#include <cinttypes>
|
||||
|
@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|||
|
||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
|
||||
ggml_type quantized_type;
|
||||
bool useNewQuantization = false;
|
||||
switch (itype) {
|
||||
case 2: quantized_type = GGML_TYPE_Q4_0; break;
|
||||
case 3: quantized_type = GGML_TYPE_Q4_1; break;
|
||||
case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
|
||||
case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
|
||||
default: throw format("invalid quantization type %d\n", itype);
|
||||
};
|
||||
|
||||
|
@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
switch (new_type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
new_size = useNewQuantization ?
|
||||
kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
|
||||
ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
new_size = useNewQuantization ?
|
||||
kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
|
||||
ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
default:
|
||||
LLAMA_ASSERT(false);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue