diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec1f97b..41958c93a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,7 +235,9 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + ggml_extra.h + ggml_extra.cpp) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump diff --git a/ggml_extra.cpp b/ggml_extra.cpp new file mode 100644 index 000000000..cabbefae7 --- /dev/null +++ b/ggml_extra.cpp @@ -0,0 +1,204 @@ +#include "ggml_extra.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +inline int toNearestInt(float fval) { + assert(fval <= 4194303.f); + constexpr float kSnapper=3<<22; + auto val = fval + kSnapper; + int i; std::memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { + work.clear(); + work.reserve(n*(nmax+2)); + float max = 0; int imax = -1; + for (int i=0; i max) { max = x; imax = i; } + } + if (imax < 0) { // all X are zero + for (int i=0; i 0) { + kmin = nmax-2; kmax = nmax + 1; + } else { + kmin = nmax/2; kmax = nmax+1; + } + } + for (int k=kmin; k<=kmax; ++k) work.push_back({(k + 0.501f)*maxi, imax}); + float minScale = work.front().first; + float maxScale = work.back().first; + for (int i=0; i maxScale) break; + if (s > minScale) work.push_back({s,i}); + } + } + std::sort(work.begin(), work.end()); + float sumlx = 0; int suml2 = 0; + float s = work.front().first; + for (int i=0; i L[i]) { + sumlx += X[i]; + suml2 += 1 + 2*L[i]; + } + else { + sumlx -= X[i]; + suml2 += 1 - 2*L[i]; + } + L[i] = l; + float sumlx2 = sumlx*sumlx; + if ((s != lasts || k == int(work.size())-1) && suml2 > 0 && sumlx2*bestSuml2 > bestSumlx2*suml2) { + bestSumlx = sumlx; bestSumlx2 = sumlx2; bestSuml2 = suml2; bests = s; + } + lasts = s; + } + for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, + std::vector>& work, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { + if (type == 0) { + float scale = kQuantize0(QK, X, L, work, -7, 7); + std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale); + uint8_t* q = (uint8_t*)y; + for (int k=0; k L(QK); + std::vector> work; + std::vector tmpX; + int nb = k / QK; + for (int i=0; i counter(0); + auto compute = [&counter, x, y, k, bucketSize, &processOne] () { + std::vector L(QK); + std::vector> work; + std::vector tmpX; + while (true) { + int first = counter.fetch_add(kChunkSize); + if (first >= k) break; + int last = first + kChunkSize; + if (last > k) last = k; + auto xi = x + first; + auto yi = y + (first/QK)*bucketSize; + int n = (last - first)/QK; + for (int i=0; i workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); +} + +} + +extern "C" { + +void kQuantizeQ4_0(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 0); +} + +void kQuantizeQ4_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 1); +} + +} diff --git a/ggml_extra.h b/ggml_extra.h new file mode 100644 index 000000000..99041bed0 --- /dev/null +++ b/ggml_extra.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + +void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +#ifdef __cplusplus +} +#endif