diff --git a/CMakeLists.txt b/CMakeLists.txt index 93c60ef43..ff62b3cba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options -if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) +if (NOT DEFINED GGML_TINYBLAS) + set(GGML_TINYBLAS ON) endif() if (NOT DEFINED GGML_AMX) diff --git a/docs/android.md b/docs/android.md index 320b62240..ac3ecb43d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -45,7 +45,7 @@ $ cmake \ -DCMAKE_C_FLAGS="-march=armv8.7a" \ -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ -DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_TINYBLAS=OFF \ -B build-android ``` diff --git a/docs/build.md b/docs/build.md index 52de2b4e2..538490a17 100644 --- a/docs/build.md +++ b/docs/build.md @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md) Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`). 
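Note (illustration, not part of this patch): with `GGML_TINYBLAS` now defaulting to `ON`, the two build configurations described in the docs above look roughly like this. This is a minimal sketch of the CLI flow assuming the standard llama.cpp CMake workflow; the flags and commands are the ones already shown in `docs/build.md`.

```sh
# Default build: the tinyBLAS backend is enabled unless overridden.
cmake -B build
cmake --build build --config Release -j 8

# Q4_0_4_4 build: disable tinyBLAS so the Arm-optimized mulmat kernels are used instead.
cmake -B build -DGGML_TINYBLAS=OFF
cmake --build build --config Release -j 8
```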
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4fb78e59f..cc9d277a9 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -57,8 +57,8 @@ else() endif() # defaults -if (NOT GGML_LLAMAFILE_DEFAULT) - set(GGML_LLAMAFILE_DEFAULT OFF) +if (NOT GGML_TINYBLAS_DEFAULT) + set(GGML_TINYBLAS_DEFAULT OFF) endif() if (NOT GGML_CUDA_GRAPHS_DEFAULT) @@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) - +option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) @@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h + include/ggml-tinyblas.h include/ggml-vulkan.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 71934c679..33d494dd7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -256,6 +256,7 @@ ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(RPC) ggml_add_backend(SYCL) +ggml_add_backend(TINYBLAS) ggml_add_backend(Vulkan) ggml_add_backend(MUSA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 78bcb6c5c..233debb12 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -91,10 +91,12 @@ struct ggml_backend_registry { return; } -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", + GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); -#endif +//#ifndef NDEBUG +// GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", +// __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); +//#endif backends.push_back(reg); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393..51c36ff57 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. 
+#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,9 +213,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); @@ -261,9 +278,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +305,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +322,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define 
GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index cef41a074..03c7607b5 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -44,16 +44,6 @@ if (GGML_OPENMP) endif() endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") - - add_compile_definitions(GGML_USE_LLAMAFILE) - - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() - if (GGML_CPU_HBM) find_library(memkind memkind REQUIRED) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7f5c465df..37a11449c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -39,14 +39,6 @@ #include #endif -#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) -#undef GGML_USE_LLAMAFILE -#endif - -#ifdef GGML_USE_LLAMAFILE -#include "llamafile/sgemm.h" -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if GGML_USE_LLAMAFILE - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const bool src1_cont = ggml_is_contiguous(src1); - - if (src1_cont) { - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)src1->data + i12*nb12 + i13*nb13, - nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - src1->type, - dst->type)) - goto UseGgmlGemm1; - return; - } -UseGgmlGemm1:; -#endif - if (src1->type != vec_dot_type) { char * wdata = params->wdata; @@ -7530,30 +7495,6 @@ UseGgmlGemm1:; ggml_barrier(params->threadpool); -#if GGML_USE_LLAMAFILE - if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, - row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - vec_dot_type, - dst->type)) - goto UseGgmlGemm2; - return; - } -UseGgmlGemm2:; -#endif - // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt index c8c4fd04e..2b197f511 100644 --- a/ggml/src/ggml-tinyblas/CMakeLists.txt +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -1,3 +1,5 @@ +message(STATUS "Using TINYBLAS") + add_library(ggml-tinyblas ggml-tinyblas.cpp ) @@ -225,6 +227,10 @@ endif() target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17) +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17") +target_compile_features (ggml-tinyblas PRIVATE cxx_std_17) + if (EMSCRIPTEN) set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp index 7317b5dd3..5d0704289 100644 --- a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp +++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp @@ -1,3 +1,48 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. 
+ #include "ggml-cpu.h" #include "ggml-impl.h" #include "ggml-tinyblas.h" @@ -7,8 +52,9 @@ #include #include -#include +// TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool +// https://github.com/ggerganov/llama.cpp/pull/1999 #ifdef GGML_USE_OPENMP #include #endif @@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas { int n_threads = GGML_DEFAULT_N_THREADS; std::unique_ptr work_data; size_t work_size = 0; - //int pp_threads = GGML_DEFAULT_N_THREADS; - //int tg_threads = GGML_DEFAULT_N_THREADS; }; template @@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas { } } - // apres conversion de B: FP32 => src0->vec_dot_type + // after convert B: FP32 => src0->vec_dot_type enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type; if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) { if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type), @@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas { src1->data, nb11/ggml_type_size(src1->type), dst->data, nb1/ggml_type_size(dst->type), 0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) { - // @ voir ca aurait etait bien de redimensioner work_data ici.. + // TODO: how to resize work_data here return true; } } @@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas { const enum ggml_type type0 = src0->type; const enum ggml_type type1 = src1->type; - // les type "directs" // broadcast factors const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas { } UseGgmlGemm1:; - // apres conversion de B ? + // with B converted from FP32 -> vec_dot_type GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float' enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type; ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float; - // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size; if (src1->type != vec_dot_type) { - // OK on va au moins essayer de changer le type de B - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); // const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - // TOD0: vor si on peu caller ca dans supports_mul_mat + // TODO: move to: supports_mul_mat if ((ith == 0) && (ctx->work_size < ne13*nbw3)) { ctx->work_data.reset(new char[ne13*nbw3]); ctx->work_size = ne13*nbw3; @@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif char * wdata = ctx->work_data.get(); @@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif // mat-mul bis... for (int64_t i13 = 0; i13 < ne13; i13++) @@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas { delete backend; } - // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin... 
- // - voir ggml_graph_compute / ggml_threadpool - // https://github.com/ggerganov/llama.cpp/pull/1999 - // static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { context * ctx = (context *)backend->context; @@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas { mul_mat(ctx, node, ith, nth); } #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); mul_mat(ctx, node, 0, 1); #endif break; @@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas { return backend != NULL && ggml_guid_matches(backend->guid, guid()); } - // number of threads to use for compute - static void set_pp_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->pp_threads = n_threads; - } - - static void set_tg_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->tg_threads = n_threads; - } - static void set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(is_tinyblas(backend)); context * ctx = (context *)backend->context; ctx->n_threads = n_threads; - //ctx->tg_threads = n_threads; - //ctx->pp_threads = n_threads; } } @@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device { } static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - //const struct ggml_tensor * src0 = op->src[0]; - //const struct ggml_tensor * src1 = op->src[1]; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg { if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml::backend::tinyblas::set_n_threads; } - if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_pp_threads; - } - if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_tg_threads; - } return NULL; } diff --git a/ggml/src/ggml-tinyblas/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp index 5c7a3c357..b82ae3f84 100644 --- a/ggml/src/ggml-tinyblas/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, @@ 
-1876,6 +1909,17 @@ namespace ggml::backend::tinyblas { // TODO #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, @@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h index 88c014e3e..18bf5230e 100644 --- a/ggml/src/ggml-tinyblas/sgemm.h +++ b/ggml/src/ggml-tinyblas/sgemm.h @@ -1,17 +1,56 @@ -#pragma once -//#include -#include "ggml.h" -#define GGML_COMMON_DECL_C -//#define GGML_COMMON_DECL_CPP -#include "ggml-common.h" +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in 
all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. -// appelé que depuis du c++ (le tinyBLAS backend) +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + +#pragma once +#include "ggml.h" +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" namespace ggml::backend::tinyblas { - // on est en C++ - // => on peu avoir autant de fonction que de type. - // calcule C = Aᵀ * B + // compute: C = Aᵀ * B template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,