From 9cdaea9240c8ea21f4eed8ab7f7248ac19844022 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 16:30:36 +0200 Subject: [PATCH] Implemented dequantize_row_q4_1 --- ggml-vulkan.cpp | 101 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 20 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b0a84942e..c722609a9 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -53,6 +53,35 @@ std::vector compileSource(const std::string& source) { return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; } +template +std::vector getVecBlockQ4_0D(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].d; + } + return fres; +} + +template +std::vector getVecBlockQ4_0M(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].m; + } + return fres; +} + +template +std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { + std::vector fres(nb*(qk/2)); + for (unsigned x_it = 0; x_it != nb; x_it++) { + for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { + fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; + } + } + return fres; +}; + static const std::string program_source_head = R"( #version 450 @@ -88,7 +117,6 @@ void main() { } ); - void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; @@ -99,25 +127,8 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { assert(k % qk == 0); - auto getVecBlockQ4_0D = [x, nb] () { - std::vector fres(nb); - for (unsigned it = 0; it != nb; it++) { - fres[it] = x[it].d; - } - return fres; - }; - auto getVecBlockQ4_0QS = [x, nb] () { - std::vector fres(nb*(qk/2)); - for (unsigned x_it = 0; x_it != nb; x_it++) { - for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { - fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; - } - } - return fres; - }; - - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D()); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS()); + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -130,6 +141,56 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { } +static const std::string program_dequantize_row_q4_1 = + program_source_head+'\n'+MULTILINE_QUOTE( +layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; +layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 3) buffer tensorY { float y[]; }; + +void main() { + const int qk = QK4_1; + + const int i = int(gl_GlobalInvocationID.x); + const int j = int(gl_GlobalInvocationID.y); + + const float d = float(x_d[i]); + const float m = float(x_m[i]); + const uint8_t qs = x_qs[i * (qk / 2) + j]; + + const int x0 = (qs & 0x0F); + const int x1 = (qs >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} +); + +void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { + static const int qk = QK4_1; + const unsigned nb = k / qk; + const unsigned y_size = nb*qk; + const static auto spirv = compileSource(program_dequantize_row_q4_1); + + const auto x = reinterpret_cast(x_); + + assert(k % qk == 0); + + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); + + mgr.sequence() + ->record({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0})) + ->record({tensorY}) + ->eval(); + + std::memcpy(y, tensorY->data(), tensorY->size()*sizeof(*y)); +} + + template<> kp::Tensor::TensorDataTypes kp::TensorT::dataType()