some cleanup with tinyblas backend

parent 7dd261f3e9
commit dda8847636

13 changed files with 264 additions and 150 deletions
|
@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||
if (NOT DEFINED GGML_TINYBLAS)
|
||||
set(GGML_TINYBLAS ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_AMX)
|
||||
|
|
|
@@ -45,7 +45,7 @@ $ cmake \
 -DCMAKE_C_FLAGS="-march=armv8.7a" \
 -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
 -DGGML_OPENMP=OFF \
--DGGML_LLAMAFILE=OFF \
+-DGGML_TINYBLAS=OFF \
 -B build-android
 ```
 
@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
 
 **Notes**:
 
-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:
@@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)
 
 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
 
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).
@@ -57,8 +57,8 @@ else()
 endif()
 
 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()
 
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
@@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
-
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)
 
@@ -91,10 +91,12 @@ struct ggml_backend_registry {
             return;
         }
 
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
                        __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+//#ifndef NDEBUG
+//        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+//                       __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+//#endif
         backends.push_back(reg);
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half ggml_half;
 typedef sycl::half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #endif
@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
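As an aside: under the plain C declarations both `GGML_COMMON_AGGR_U` and `GGML_COMMON_AGGR_S` expand to nothing, so the union and struct above stay anonymous and `d`, `m`, `dm` are reached directly; under the new `GGML_COMMON_DECL_CPP` branch both expand to `data`, naming the aggregates so that standard C++ (which does not allow anonymous structs) accepts them. A minimal sketch of what `block_q4_1` expands to in the C++ case, assuming only the macros shown in this diff:

```cpp
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define QK4_1 32

// Expansion under GGML_COMMON_DECL_CPP, where both aggregate macros
// expand to the member name `data`:
typedef struct {
    union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        } data;          // GGML_COMMON_AGGR_S
        ggml_half2 dm;   // the same 4 bytes, viewed as one half2
    } data;              // GGML_COMMON_AGGR_U
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;

static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2,
              "wrong q4_1 block size/padding");

// In C (both macros empty) the fields are reached as b.d / b.dm;
// in this C++ expansion they become b.data.data.d / b.data.dm.
```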
@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -288,9 +305,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits
@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()
 
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    target_sources(ggml-cpu PRIVATE
-                    llamafile/sgemm.cpp
-                    llamafile/sgemm.h)
-endif()
-
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)
 
@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
 
@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;
 
     ggml_barrier(params->threadpool);
 
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;
 
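The two deleted blocks share a try-the-fast-path-then-fall-through shape: every (i12, i13) tile is offered to `llamafile_sgemm`, and if any tile is declined the code jumps over the early `return` into the generic path. A minimal sketch of that control flow, with `fast_gemm` as a hypothetical stand-in for `llamafile_sgemm`:

```cpp
#include <cstdint>

// Hypothetical stand-in for llamafile_sgemm: returns false when the
// optimized kernel cannot handle a given tile.
static bool fast_gemm(int64_t i12, int64_t i13) {
    return (i12 + i13) % 3 != 0; // arbitrary demo condition
}

static void mul_mat_sketch(int64_t ne12, int64_t ne13) {
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            if (!fast_gemm(i12, i13)) {
                goto UseGgmlGemm1; // one tile declined: fall back for all
            }
        }
    }
    return;        // fast path handled every tile
UseGgmlGemm1:;
    // ... the generic vec_dot-based matmul would run here ...
}
```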
@@ -1,3 +1,5 @@
 message(STATUS "Using TINYBLAS")
 
 add_library(ggml-tinyblas
+            ggml-tinyblas.cpp
+            )
@@ -225,6 +227,10 @@ endif()
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features(ggml-tinyblas PRIVATE cxx_std_17)
+
 if (EMSCRIPTEN)
     set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
@@ -1,3 +1,48 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "ggml-tinyblas.h"
@@ -7,8 +52,9 @@
 
 #include <memory>
 #include <cstring>
+#include <iostream>
 
 // TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool
 //       https://github.com/ggerganov/llama.cpp/pull/1999
 #ifdef GGML_USE_OPENMP
 #include <omp.h>
 #endif
@@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas {
         int n_threads = GGML_DEFAULT_N_THREADS;
         std::unique_ptr<char[]> work_data;
         size_t work_size = 0;
-        //int pp_threads = GGML_DEFAULT_N_THREADS;
-        //int tg_threads = GGML_DEFAULT_N_THREADS;
     };
 
     template<bool RUN>
@@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas {
             }
         }
 
-        // apres conversion de B: FP32 => src0->vec_dot_type
+        // after convert B: FP32 => src0->vec_dot_type
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
             if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas {
                            src1->data, nb11/ggml_type_size(src1->type),
                            dst->data, nb1/ggml_type_size(dst->type),
                            0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
-                // @ voir ca aurait etait bien de redimensioner work_data ici..
+                // TODO: how to resize work_data here
                 return true;
             }
         }
@@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas {
         const enum ggml_type type0 = src0->type;
         const enum ggml_type type1 = src1->type;
 
-        // les type "directs"
         // broadcast factors
         const int64_t r2 = ne12 / ne02;
         const int64_t r3 = ne13 / ne03;
@@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas {
         }
 UseGgmlGemm1:;
 
-        // apres conversion de B ?
+        // with B converted from FP32 -> vec_dot_type
         GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
         ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
-        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
 
         if (src1->type != vec_dot_type) {
-            // OK on va au moins essayer de changer le type de B
 
             const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
-            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
             const size_t nbw2 = nbw1*ne11;
             const size_t nbw3 = nbw2*ne12;
 
-            // TOD0: vor si on peu caller ca dans supports_mul_mat
+            // TODO: move to: supports_mul_mat
             if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
                 ctx->work_data.reset(new char[ne13*nbw3]);
                 ctx->work_size = ne13*nbw3;
@@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             char * wdata = ctx->work_data.get();
 
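The pattern around this hunk is worth spelling out: thread 0 grows the shared scratch buffer while the other threads wait at an OpenMP barrier before touching it. A hedged sketch of that idiom (the `needed` size and `convert_B` wrapper are illustrative, not from the diff):

```cpp
#include <memory>
#include <omp.h>

struct context {
    std::unique_ptr<char[]> work_data;
    size_t work_size = 0;
};

// Sketch: prepare a shared scratch buffer from many threads.
void convert_B(context *ctx, size_t needed) {
    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();
        // Only thread 0 reallocates, so ownership never races.
        if (ith == 0 && ctx->work_size < needed) {
            ctx->work_data.reset(new char[needed]);
            ctx->work_size = needed;
        }
        // Everyone waits until the buffer is valid before writing slices.
        #pragma omp barrier
        char *wdata = ctx->work_data.get();
        (void)wdata; // ... each thread converts its share of rows here ...
    }
}
```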
@@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             // mat-mul bis...
             for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas {
             delete backend;
         }
 
-        // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin...
-        //  - voir ggml_graph_compute / ggml_threadpool
-        //    https://github.com/ggerganov/llama.cpp/pull/1999
-        //
         static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
             context * ctx = (context *)backend->context;
 
@@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas {
                     mul_mat(ctx, node, ith, nth);
                 }
 #else
-                static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
                 mul_mat(ctx, node, 0, 1);
 #endif
                 break;
@@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas {
         return backend != NULL && ggml_guid_matches(backend->guid, guid());
     }
 
-    // number of threads to use for compute
-    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->pp_threads = n_threads;
-    }
-
-    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->tg_threads = n_threads;
-    }
-
     static void set_n_threads(ggml_backend_t backend, int n_threads) {
         GGML_ASSERT(is_tinyblas(backend));
         context * ctx = (context *)backend->context;
         ctx->n_threads = n_threads;
-        //ctx->tg_threads = n_threads;
-        //ctx->pp_threads = n_threads;
     }
 
 }
@@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device {
     }
 
     static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
-        //const struct ggml_tensor * src0 = op->src[0];
-        //const struct ggml_tensor * src1 = op->src[1];
-
         switch (op->op) {
             case GGML_OP_NONE:
             case GGML_OP_RESHAPE:
@@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg {
         if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
             return (void *)ggml::backend::tinyblas::set_n_threads;
         }
-        if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_pp_threads;
-        }
-        if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_tg_threads;
-        }
         return NULL;
     }
 
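For context, `ggml_backend_set_n_threads` is the optional per-backend entry point that callers look up by name through the registry. A hedged sketch of the caller side, assuming ggml's registry API as of late 2024 (`ggml_backend_reg_get_proc_address` and the `ggml_backend_set_n_threads_t` typedef are not shown in this diff):

```cpp
#include "ggml-backend.h"

// Sketch: configure the thread count of a backend that exposes the
// optional "ggml_backend_set_n_threads" entry point by name.
static void try_set_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_reg_t reg =
        ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
    auto set_n_threads = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads != nullptr) { // the entry point is optional
        set_n_threads(backend, n_threads);
    }
}
```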
@@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
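These `GGML_UNUSED` statements sit after the `return`, so they are never executed; they exist so that every parameter is referenced even on build configurations where the whole `#if` body compiles away, silencing unused-parameter warnings. In ggml the macro is essentially a void-cast, as in this small sketch:

```cpp
// ggml.h defines roughly: #define GGML_UNUSED(x) (void)(x)
#define GGML_UNUSED(x) (void)(x)

static bool gemm_stub(int m, int n, int k) {
#if defined(__AVX2__)
    // a real kernel would run here and use m, n, k
#endif
    return false;
    // Unreachable, but still compiled: marks the parameters as used
    // when the #if branch above is empty.
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
}
```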
@@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas {
     // TODO
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1,17 +1,56 @@
-#pragma once
-//#include <cstdint>
-#include "ggml.h"
-#define GGML_COMMON_DECL_C
-//#define GGML_COMMON_DECL_CPP
-#include "ggml-common.h"
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 
-// appelé que depuis du c++ (le tinyBLAS backend)
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
+#pragma once
+#include "ggml.h"
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 namespace ggml::backend::tinyblas {
 
-    // on est en C++
-    // => on peu avoir autant de fonction que de type.
-    // calcule C = Aᵀ * B
+    // compute: C = Aᵀ * B
     template<bool RUN>
     bool gemm(int64_t m, int64_t n, int64_t k,
               const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
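A note on the `template<bool RUN>` parameter visible here and in the `mul_mat<false>` call earlier in the diff: the same function body doubles as a capability probe (`RUN == false`, nothing is computed) and as the actual kernel (`RUN == true`). A minimal sketch of the idiom, independent of the real tinyBLAS signatures:

```cpp
#include <cstdint>

// Sketch of the probe/run idiom: with RUN == false the function only
// reports whether a kernel exists for the given shapes; with
// RUN == true it also executes it.
template <bool RUN>
bool gemm_sketch(int64_t m, int64_t n, int64_t k,
                 const float *A, int64_t lda,
                 const float *B, int64_t ldb,
                 float *C, int64_t ldc) {
    if (m <= 0 || n <= 0 || k <= 0) {
        return false; // unsupported shape: caller must fall back
    }
    if (RUN) {
        for (int64_t j = 0; j < n; j++) {
            for (int64_t i = 0; i < m; i++) {
                float sum = 0.0f;
                for (int64_t l = 0; l < k; l++) {
                    sum += A[l + i * lda] * B[l + j * ldb]; // C = Aᵀ * B
                }
                C[i + j * ldc] = sum;
            }
        }
    }
    return true;
}

// supports_mul_mat-style check first, then the real computation:
// if (gemm_sketch<false>(m, n, k, A, lda, B, ldb, C, ldc)) {
//     gemm_sketch<true>(m, n, k, A, lda, B, ldb, C, ldc);
// }
```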