ggml : add AArch64 optimized GEMV and GEMM Q4 kernels (#5780)
* Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization
* Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions
* Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions
* Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions
* Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions
* Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files
* Arm AArch64: minor code refactoring for rebase
* Arm AArch64: minor code refactoring for resolving a build issue with cmake
* Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8
* Arm AArch64: minor code change for resolving a build issue with server-windows
* retrigger checks
* Arm AArch64: minor code changes for rebase
* Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits
* Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig
* Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8
* Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8
* Arm AArch64: minor code refactoring
* Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat
* Arm AArch64: minimize changes in ggml_compute_forward_mul_mat
* Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types
* Arm AArch64: minor code refactoring
* Arm AArch64: minor code refactoring
* Arm AArch64: minor code refactoring
* rebase on the latest master commit 3fd62a6
and adapt to the new directory structure
* Arm AArch64: remove a redundant comment
* Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off
* Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels
* Arm AArch64: update docs/build.md README to include compile time flags for buiilding the Q4_0_4_4 quant type
This commit is contained in:
parent
83321c6958
commit
0f1a39f343
14 changed files with 2534 additions and 53 deletions
|
@ -199,6 +199,30 @@ typedef struct {
|
|||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
||||
} block_q4_0x4;
|
||||
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
||||
} block_q4_0x8;
|
||||
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
||||
} block_q8_0x4;
|
||||
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
||||
} block_q8_0x8;
|
||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue