diff --git a/ggml_extra.cpp b/ggml_extra.cpp index e2ae005df..9eb9bbe83 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -23,7 +23,7 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } -float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { +std::pair kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); float max = 0; int imax = -1; @@ -33,7 +33,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector 0) { - kmin = nmax-2; kmax = nmax + 1; + kmin = nmax-2; kmax = nmax+1; } else { kmin = nmax/2; kmax = nmax+1; } @@ -97,7 +97,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, @@ -137,7 +137,17 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - float scale = kQuantize0(QK, X, L, work, -7, 7); + if (int(tmpX.size()) < QK) tmpX.resize(QK); + auto r1 = kQuantize0(QK, X, L, work, -8, 7); + for (int i=0; i r1.first) { + scale = -r2.first; + std::memcpy(L, L2, QK); + } + //float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k