Add new quantization to quantize
This commit is contained in:
parent
92408cd983
commit
709d23543a
5 changed files with 76 additions and 28 deletions
27
Makefile
27
Makefile
|
@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
|
||||||
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
|
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
|
||||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||||
|
|
||||||
|
ggml_extra.o: ggml_extra.cpp ggml_extra.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common.o: examples/common.cpp examples/common.h
|
common.o: examples/common.cpp examples/common.h
|
||||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -vf *.o main quantize quantize-stats perplexity embedding
|
rm -vf *.o main quantize quantize-stats perplexity embedding
|
||||||
|
|
||||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
@echo
|
@echo
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
@echo
|
@echo
|
||||||
|
|
||||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o
|
quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
libllama.so: llama.o ggml.o
|
libllama.so: llama.o ggml.o ggml_extra.o
|
||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
#
|
#
|
||||||
# Tests
|
# Tests
|
||||||
#
|
#
|
||||||
|
|
|
@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||||
fprintf(stderr, " type = 2 - q4_0\n");
|
fprintf(stderr, " type = 2 - q4_0\n");
|
||||||
fprintf(stderr, " type = 3 - q4_1\n");
|
fprintf(stderr, " type = 3 - q4_1\n");
|
||||||
|
fprintf(stderr, " type = 4 - new q4_0\n");
|
||||||
|
fprintf(stderr, " type = 5 - new q4_1\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,11 @@
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
constexpr int kChunkSize = 32*32*8;
|
||||||
|
constexpr int QK = 32;
|
||||||
|
constexpr int kBucketSize0 = QK/2 + sizeof(float);
|
||||||
|
constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
|
||||||
|
|
||||||
inline int toNearestInt(float fval) {
|
inline int toNearestInt(float fval) {
|
||||||
assert(fval <= 4194303.f);
|
assert(fval <= 4194303.f);
|
||||||
constexpr float kSnapper=3<<22;
|
constexpr float kSnapper=3<<22;
|
||||||
|
@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
|
||||||
return {a, b};
|
return {a, b};
|
||||||
}
|
}
|
||||||
|
|
||||||
void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
|
void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
|
||||||
constexpr int kChunkSize = 32*32*8;
|
|
||||||
constexpr int QK = 32;
|
|
||||||
constexpr int kBucketSize0 = QK/2 + sizeof(float);
|
|
||||||
constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
|
|
||||||
assert(k % QK == 0);
|
assert(k % QK == 0);
|
||||||
|
|
||||||
auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
|
auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
|
||||||
|
auto q = (uint8_t*)y;
|
||||||
if (type == 0) {
|
if (type == 0) {
|
||||||
float scale = kQuantize0(QK, X, L, work, -7, 7);
|
float scale = kQuantize0(QK, X, L, work, -7, 7);
|
||||||
std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
|
std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
|
||||||
uint8_t* q = (uint8_t*)y;
|
|
||||||
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
|
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
|
||||||
} else {
|
} else {
|
||||||
auto result = kQuantize1(QK, X, L, tmpX, work, 7);
|
auto result = kQuantize1(QK, X, L, tmpX, work, 7);
|
||||||
std::memcpy(y, &result.second, sizeof(result.second)); y += sizeof(result.second);
|
std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
|
||||||
std::memcpy(y, &result.first, sizeof(result.first)); y += sizeof(result.first);
|
std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
|
||||||
uint8_t* q = (uint8_t*)y;
|
|
||||||
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -156,24 +156,25 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
|
||||||
std::vector<std::pair<float,int>> work;
|
std::vector<std::pair<float,int>> work;
|
||||||
std::vector<float> tmpX;
|
std::vector<float> tmpX;
|
||||||
int nb = k / QK;
|
int nb = k / QK;
|
||||||
|
auto x = X;
|
||||||
for (int i=0; i<nb; ++i) {
|
for (int i=0; i<nb; ++i) {
|
||||||
processOne(x + QK*i, L.data(), y, work, tmpX);
|
processOne(x, L.data(), y, work, tmpX);
|
||||||
y += bucketSize; x += QK;
|
y += bucketSize; x += QK;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::atomic<int> counter(0);
|
std::atomic<int> counter(0);
|
||||||
auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
|
auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
|
||||||
std::vector<int8_t> L(QK);
|
std::vector<int8_t> L(QK);
|
||||||
std::vector<std::pair<float,int>> work;
|
std::vector<std::pair<float,int>> work;
|
||||||
std::vector<float> tmpX;
|
std::vector<float> tmpX;
|
||||||
while (true) {
|
while (true) {
|
||||||
int first = counter.fetch_add(kChunkSize);
|
int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
|
||||||
if (first >= k) break;
|
if (first >= k) break;
|
||||||
int last = first + kChunkSize;
|
int last = first + kChunkSize;
|
||||||
if (last > k) last = k;
|
if (last > k) last = k;
|
||||||
auto xi = x + first;
|
auto xi = X + first;
|
||||||
auto yi = y + (first/QK)*bucketSize;
|
auto yi = y + (first/QK)*bucketSize;
|
||||||
int n = (last - first)/QK;
|
int n = (last - first)/QK;
|
||||||
for (int i=0; i<n; ++i) {
|
for (int i=0; i<n; ++i) {
|
||||||
|
@ -189,6 +190,21 @@ void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k
|
||||||
for (auto& w : workers) w.join();
|
for (auto& w : workers) w.join();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void collectHisto(int k, const void* buffer, int64_t* hist, int type) {
|
||||||
|
if (!hist) return;
|
||||||
|
auto y = (const uint8_t*)buffer;
|
||||||
|
int m = type == 0 ? 4 : 8;
|
||||||
|
int n = k / 32;
|
||||||
|
for (int i=0; i<n; ++i) {
|
||||||
|
y += m;
|
||||||
|
for (int l=0; l<16; ++l) {
|
||||||
|
++hist[y[l] & 15];
|
||||||
|
++hist[y[l] >> 4];
|
||||||
|
}
|
||||||
|
y += 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
|
||||||
kQuantizeQ4(x, buffer, k, 1);
|
kQuantizeQ4(x, buffer, k, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
|
||||||
|
kQuantizeQ4(x, buffer, k, 0);
|
||||||
|
collectHisto(k, buffer, hist, 0);
|
||||||
|
return (k / QK) * kBucketSize0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
|
||||||
|
kQuantizeQ4(x, buffer, k, 1);
|
||||||
|
collectHisto(k, buffer, hist, 1);
|
||||||
|
return (k / QK) * kBucketSize1;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstddef>
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
#else
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -12,8 +17,10 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||||
|
size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
||||||
|
|
||||||
void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||||
|
size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
12
llama.cpp
12
llama.cpp
|
@ -8,6 +8,7 @@
|
||||||
#include "llama_internal.h"
|
#include "llama_internal.h"
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "ggml_extra.h"
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
|
@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||||
|
|
||||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
|
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
|
||||||
ggml_type quantized_type;
|
ggml_type quantized_type;
|
||||||
|
bool useNewQuantization = false;
|
||||||
switch (itype) {
|
switch (itype) {
|
||||||
case 2: quantized_type = GGML_TYPE_Q4_0; break;
|
case 2: quantized_type = GGML_TYPE_Q4_0; break;
|
||||||
case 3: quantized_type = GGML_TYPE_Q4_1; break;
|
case 3: quantized_type = GGML_TYPE_Q4_1; break;
|
||||||
|
case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
|
||||||
|
case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
|
||||||
default: throw format("invalid quantization type %d\n", itype);
|
default: throw format("invalid quantization type %d\n", itype);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
switch (new_type) {
|
switch (new_type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
{
|
{
|
||||||
new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
new_size = useNewQuantization ?
|
||||||
|
kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
|
||||||
|
ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
{
|
{
|
||||||
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
new_size = useNewQuantization ?
|
||||||
|
kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
|
||||||
|
ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
LLAMA_ASSERT(false);
|
LLAMA_ASSERT(false);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue