ggml : add AArch64 optimized GEMV and GEMM Q4 kernels (#5780)

* Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files * Arm AArch64: minor code refactoring for rebase * Arm AArch64: minor code refactoring for resolving a build issue with cmake * Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code change for resolving a build issue with server-windows * retrigger checks * Arm AArch64: minor code changes for rebase * Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits * Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig * Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code refactoring * Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat * Arm AArch64: minimize changes in ggml_compute_forward_mul_mat * Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * Arm AArch64: minor code 
refactoring * rebase on the latest master commit 3fd62a6 and adapt to the new directory structure * Arm AArch64: remove a redundant comment * Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off * Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels * Arm AArch64: update docs/build.md README to include compile time flags for building the Q4_0_4_4 quant type
2024-07-10 07:14:51 -05:00 · 2024-07-10 07:14:51 -05:00 · 0f1a39f343
commit 0f1a39f343
parent 83321c6958
14 changed files with 2534 additions and 53 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -383,6 +383,9 @@ extern "C" {
        GGML_TYPE_F64     = 28,
        GGML_TYPE_IQ1_M   = 29,
        GGML_TYPE_BF16    = 30,
+        GGML_TYPE_Q4_0_4_4 = 31,
+        GGML_TYPE_Q4_0_4_8 = 32,
+        GGML_TYPE_Q4_0_8_8 = 33,
        GGML_TYPE_COUNT,
    };

@ -424,6 +427,9 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
    };

    // available tensor operations:
@ -2406,6 +2412,12 @@ extern "C" {
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t nr,
+                                             int64_t k, int64_t bx);
+    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);

    typedef struct {
        const char      * type_name;
@ -2418,6 +2430,11 @@ extern "C" {
        ggml_vec_dot_t    vec_dot;
        enum ggml_type    vec_dot_type;
        int64_t           nrows; // number of rows to process simultaneously;
+        int64_t           ncols; // number of columns to process simultaneously;
+        int64_t           interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+        ggml_from_float_to_mat_t from_float_to_mat;
+        ggml_gemv_t       gemv;
+        ggml_gemm_t       gemm;
    } ggml_type_traits_t;

    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);