CUDA: optimize and refactor MMQ (#8416)

* CUDA: optimize and refactor MMQ

* explicit q8_1 memory layouts, add documentation
This commit is contained in:
Johannes Gäßler 2024-07-11 16:47:47 +02:00 committed by GitHub
parent a977c11544
commit 808aba3916
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 867 additions and 687 deletions

View file

@ -5,7 +5,11 @@
#include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
typedef void (*quantize_cuda_t)(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,