diff --git a/CMakeLists.txt b/CMakeLists.txt index 93c60ef43..ff62b3cba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options -if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) +if (NOT DEFINED GGML_TINYBLAS) + set(GGML_TINYBLAS ON) endif() if (NOT DEFINED GGML_AMX) diff --git a/docs/android.md b/docs/android.md index 320b62240..ac3ecb43d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -45,7 +45,7 @@ $ cmake \ -DCMAKE_C_FLAGS="-march=armv8.7a" \ -DCMAKE_CXX_FLAGS="-march=armv8.7a" \ -DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_TINYBLAS=OFF \ -B build-android ``` diff --git a/docs/build.md b/docs/build.md index 52de2b4e2..538490a17 100644 --- a/docs/build.md +++ b/docs/build.md @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md) Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`). 
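Note (illustration, not part of this patch): with `GGML_TINYBLAS` now defaulting to `ON`, the two build configurations described in the docs above look roughly like this. This is a minimal sketch of the CLI flow assuming the standard llama.cpp CMake workflow; the flags and commands are the ones already shown in `docs/build.md`.

```sh
# Default build: the tinyBLAS backend is enabled unless overridden.
cmake -B build
cmake --build build --config Release -j 8

# Q4_0_4_4 build: disable tinyBLAS so the Arm-optimized mulmat kernels are used instead.
cmake -B build -DGGML_TINYBLAS=OFF
cmake --build build --config Release -j 8
```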
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4fb78e59f..cc9d277a9 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -57,8 +57,8 @@ else() endif() # defaults -if (NOT GGML_LLAMAFILE_DEFAULT) - set(GGML_LLAMAFILE_DEFAULT OFF) +if (NOT GGML_TINYBLAS_DEFAULT) + set(GGML_TINYBLAS_DEFAULT OFF) endif() if (NOT GGML_CUDA_GRAPHS_DEFAULT) @@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) - +option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) @@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h + include/ggml-tinyblas.h include/ggml-vulkan.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 71934c679..33d494dd7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -256,6 +256,7 @@ ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(RPC) ggml_add_backend(SYCL) +ggml_add_backend(TINYBLAS) ggml_add_backend(Vulkan) ggml_add_backend(MUSA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 78bcb6c5c..233debb12 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -91,10 +91,12 @@ struct ggml_backend_registry { return; } -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", + GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); -#endif +//#ifndef NDEBUG +// GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", +// __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); +//#endif backends.push_back(reg); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393..51c36ff57 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. 
+#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,9 +213,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); @@ -261,9 +278,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +305,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +322,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define 
GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index cef41a074..03c7607b5 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -44,16 +44,6 @@ if (GGML_OPENMP) endif() endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") - - add_compile_definitions(GGML_USE_LLAMAFILE) - - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() - if (GGML_CPU_HBM) find_library(memkind memkind REQUIRED) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7f5c465df..37a11449c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -39,14 +39,6 @@ #include #endif -#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) -#undef GGML_USE_LLAMAFILE -#endif - -#ifdef GGML_USE_LLAMAFILE -#include "llamafile/sgemm.h" -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if GGML_USE_LLAMAFILE - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const bool src1_cont = ggml_is_contiguous(src1); - - if (src1_cont) { - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)src1->data + i12*nb12 + i13*nb13, - nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - src1->type, - dst->type)) - goto UseGgmlGemm1; - return; - } -UseGgmlGemm1:; -#endif - if (src1->type != vec_dot_type) { char * wdata = params->wdata; @@ -7530,30 +7495,6 @@ UseGgmlGemm1:; ggml_barrier(params->threadpool); -#if GGML_USE_LLAMAFILE - if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, - row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - vec_dot_type, - dst->type)) - goto UseGgmlGemm2; - return; - } -UseGgmlGemm2:; -#endif - // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt index c8c4fd04e..2b197f511 100644 --- a/ggml/src/ggml-tinyblas/CMakeLists.txt +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -1,3 +1,5 @@ +message(STATUS "Using TINYBLAS") + add_library(ggml-tinyblas ggml-tinyblas.cpp ) @@ -225,6 +227,10 @@ endif() target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17) +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17") +target_compile_features (ggml-tinyblas PRIVATE cxx_std_17) + if (EMSCRIPTEN) set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp index 7317b5dd3..5d0704289 100644 --- a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp +++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp @@ -1,3 +1,48 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. 
+ #include "ggml-cpu.h" #include "ggml-impl.h" #include "ggml-tinyblas.h" @@ -7,8 +52,9 @@ #include #include -#include +// TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool +// https://github.com/ggerganov/llama.cpp/pull/1999 #ifdef GGML_USE_OPENMP #include #endif @@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas { int n_threads = GGML_DEFAULT_N_THREADS; std::unique_ptr work_data; size_t work_size = 0; - //int pp_threads = GGML_DEFAULT_N_THREADS; - //int tg_threads = GGML_DEFAULT_N_THREADS; }; template @@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas { } } - // apres conversion de B: FP32 => src0->vec_dot_type + // after convert B: FP32 => src0->vec_dot_type enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type; if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) { if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type), @@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas { src1->data, nb11/ggml_type_size(src1->type), dst->data, nb1/ggml_type_size(dst->type), 0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) { - // @ voir ca aurait etait bien de redimensioner work_data ici.. + // TODO: how to resize work_data here return true; } } @@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas { const enum ggml_type type0 = src0->type; const enum ggml_type type1 = src1->type; - // les type "directs" // broadcast factors const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas { } UseGgmlGemm1:; - // apres conversion de B ? + // with B converted from FP32 -> vec_dot_type GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float' enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type; ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float; - // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size; if (src1->type != vec_dot_type) { - // OK on va au moins essayer de changer le type de B - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); // const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - // TOD0: vor si on peu caller ca dans supports_mul_mat + // TODO: move to: supports_mul_mat if ((ith == 0) && (ctx->work_size < ne13*nbw3)) { ctx->work_data.reset(new char[ne13*nbw3]); ctx->work_size = ne13*nbw3; @@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif char * wdata = ctx->work_data.get(); @@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif // mat-mul bis... for (int64_t i13 = 0; i13 < ne13; i13++) @@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas { delete backend; } - // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin... 
- // - voir ggml_graph_compute / ggml_threadpool - // https://github.com/ggerganov/llama.cpp/pull/1999 - // static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { context * ctx = (context *)backend->context; @@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas { mul_mat(ctx, node, ith, nth); } #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); mul_mat(ctx, node, 0, 1); #endif break; @@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas { return backend != NULL && ggml_guid_matches(backend->guid, guid()); } - // number of threads to use for compute - static void set_pp_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->pp_threads = n_threads; - } - - static void set_tg_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->tg_threads = n_threads; - } - static void set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(is_tinyblas(backend)); context * ctx = (context *)backend->context; ctx->n_threads = n_threads; - //ctx->tg_threads = n_threads; - //ctx->pp_threads = n_threads; } } @@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device { } static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - //const struct ggml_tensor * src0 = op->src[0]; - //const struct ggml_tensor * src1 = op->src[1]; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg { if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml::backend::tinyblas::set_n_threads; } - if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_pp_threads; - } - if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_tg_threads; - } return NULL; } diff --git a/ggml/src/ggml-tinyblas/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp index 5c7a3c357..b82ae3f84 100644 --- a/ggml/src/ggml-tinyblas/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, @@ 
-1876,6 +1909,17 @@ namespace ggml::backend::tinyblas { // TODO #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, @@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h index 88c014e3e..18bf5230e 100644 --- a/ggml/src/ggml-tinyblas/sgemm.h +++ b/ggml/src/ggml-tinyblas/sgemm.h @@ -1,17 +1,56 @@ -#pragma once -//#include -#include "ggml.h" -#define GGML_COMMON_DECL_C -//#define GGML_COMMON_DECL_CPP -#include "ggml-common.h" +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in 
all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. -// appelé que depuis du c++ (le tinyBLAS backend) +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + +#pragma once +#include "ggml.h" +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" namespace ggml::backend::tinyblas { - // on est en C++ - // => on peu avoir autant de fonction que de type. - // calcule C = Aᵀ * B + // compute: C = Aᵀ * B template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,