some cleanup with tinyblas backend

Djip007 2024-11-16 22:30:02 +01:00
parent 7dd261f3e9
commit dda8847636
13 changed files with 264 additions and 150 deletions


@@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()
 if (NOT DEFINED GGML_AMX)


@@ -45,7 +45,7 @@ $ cmake \
     -DCMAKE_C_FLAGS="-march=armv8.7a" \
     -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
     -DGGML_OPENMP=OFF \
-    -DGGML_LLAMAFILE=OFF \
+    -DGGML_TINYBLAS=OFF \
     -B build-android
 ```


@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
 **Notes**:
-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:
@@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)
 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).


@@ -57,8 +57,8 @@ else()
 endif()
 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
@@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")


@@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)


@@ -91,10 +91,12 @@ struct ggml_backend_registry {
         return;
     }
-#ifndef NDEBUG
-    GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+    GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
         __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+    //#ifndef NDEBUG
+    //    GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+    //        __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+    //#endif
     backends.push_back(reg);
     for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
         register_device(ggml_backend_reg_dev_get(reg, i));


@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+// std-c++ allows anonymous unions, but some compilers warn on them
+#define GGML_COMMON_AGGR_U data
+// std-c++ does not allow anonymous structs
+#define GGML_COMMON_AGGR_S data
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half ggml_half;
 typedef sycl::half2 ggml_half2;
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 #define GGML_COMMON_DECL
 #endif
@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
            ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4]; // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
         struct {
             ggml_half d; // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
@@ -288,9 +305,9 @@ typedef struct {
         struct {
             ggml_half d; // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
         struct {
             ggml_half d; // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8]; // quants, high bit
     uint8_t qs[QK_K/2]; // quants, low 4 bits
@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
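For context on the `GGML_COMMON_AGGR_U` / `GGML_COMMON_AGGR_S` split above: in the C declarations both macros expand to nothing, so the union and the inner struct stay anonymous, while the new C++ (and CUDA/HIP/SYCL) declarations expand them to a member name such as `data`, avoiding the anonymous-struct extension that standard C++ compilers warn about. A minimal stand-alone sketch of the C++ expansion, using a hypothetical `block_demo` type rather than one of the real quant blocks:

```cpp
#include <cstdint>
#include <cstdio>

// Expansion used by the new GGML_COMMON_DECL_CPP branch (both macros -> data).
#define GGML_COMMON_AGGR_U data
#define GGML_COMMON_AGGR_S data

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

// Hypothetical block shaped like block_q4_1, for illustration only.
typedef struct {
    union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        } GGML_COMMON_AGGR_S;   // named struct member: .data
        ggml_half2 dm;          // packed view of d and m
    } GGML_COMMON_AGGR_U;       // named union member: .data
    uint8_t qs[16];
} block_demo;

int main() {
    block_demo b{};
    b.data.data.d = 1;   // C++ branch: fields reached through the named members
    b.data.data.m = 2;
    // In the C branch both macros expand to nothing, so the same fields are
    // reached as b.d / b.m / b.dm through the anonymous union and struct.
    std::printf("dm as packed half2 bits: %u\n", (unsigned) b.data.dm);
    return 0;
}
```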


@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-    target_sources(ggml-cpu PRIVATE
-                llamafile/sgemm.cpp
-                llamafile/sgemm.h)
-endif()
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)


@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-    const bool src1_cont = ggml_is_contiguous(src1);
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;
     ggml_barrier(params->threadpool);
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;


@@ -1,3 +1,5 @@
+message(STATUS "Using TINYBLAS")
 add_library(ggml-tinyblas
             ggml-tinyblas.cpp
             )
@@ -225,6 +227,10 @@ endif()
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features(ggml-tinyblas PRIVATE cxx_std_17)
 if (EMSCRIPTEN)
     set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()


@@ -1,3 +1,48 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  | _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                            |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, LLaMA Now Goes Faster on CPUs, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "ggml-tinyblas.h"
@@ -7,8 +52,9 @@
 #include <memory>
 #include <cstring>
-#include <iostream>
+// TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool
+//       https://github.com/ggerganov/llama.cpp/pull/1999
 #ifdef GGML_USE_OPENMP
 #include <omp.h>
 #endif
@@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas {
         int n_threads = GGML_DEFAULT_N_THREADS;
         std::unique_ptr<char[]> work_data;
         size_t work_size = 0;
-        //int pp_threads = GGML_DEFAULT_N_THREADS;
-        //int tg_threads = GGML_DEFAULT_N_THREADS;
     };
     template<bool RUN>
@@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas {
             }
         }
-        // apres conversion de B: FP32 => src0->vec_dot_type
+        // after converting B: FP32 => src0->vec_dot_type
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
             if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas {
                                src1->data, nb11/ggml_type_size(src1->type),
                                dst->data, nb1/ggml_type_size(dst->type),
                                0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
-                // @ voir ca aurait etait bien de redimensioner work_data ici..
+                // TODO: how to resize work_data here
                 return true;
             }
         }
@@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas {
         const enum ggml_type type0 = src0->type;
         const enum ggml_type type1 = src1->type;
-        // les type "directs"
         // broadcast factors
         const int64_t r2 = ne12 / ne02;
         const int64_t r3 = ne13 / ne03;
@@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas {
         }
 UseGgmlGemm1:;
-        // apres conversion de B ?
+        // with B converted from FP32 -> vec_dot_type
         GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
         ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
-        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
         if (src1->type != vec_dot_type) {
-            // OK on va au moins essayer de changer le type de B
             const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
             // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
             const size_t nbw2 = nbw1*ne11;
             const size_t nbw3 = nbw2*ne12;
-            // TOD0: vor si on peu caller ca dans supports_mul_mat
+            // TODO: move to: supports_mul_mat
             if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
                 ctx->work_data.reset(new char[ne13*nbw3]);
                 ctx->work_size = ne13*nbw3;
@@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             char * wdata = ctx->work_data.get();
@@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             // mat-mul bis...
             for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas {
         delete backend;
     }
-    // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin...
-    //  - voir ggml_graph_compute / ggml_threadpool
-    //    https://github.com/ggerganov/llama.cpp/pull/1999
-    //
     static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
         context * ctx = (context *)backend->context;
@@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas {
                 mul_mat(ctx, node, ith, nth);
             }
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
             mul_mat(ctx, node, 0, 1);
 #endif
             break;
@@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas {
         return backend != NULL && ggml_guid_matches(backend->guid, guid());
     }
-    // number of threads to use for compute
-    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->pp_threads = n_threads;
-    }
-    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->tg_threads = n_threads;
-    }
     static void set_n_threads(ggml_backend_t backend, int n_threads) {
         GGML_ASSERT(is_tinyblas(backend));
         context * ctx = (context *)backend->context;
         ctx->n_threads = n_threads;
-        //ctx->tg_threads = n_threads;
-        //ctx->pp_threads = n_threads;
     }
 }
@@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device {
     }
     static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
-        //const struct ggml_tensor * src0 = op->src[0];
-        //const struct ggml_tensor * src1 = op->src[1];
         switch (op->op) {
             case GGML_OP_NONE:
             case GGML_OP_RESHAPE:
@@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg {
         if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
             return (void *)ggml::backend::tinyblas::set_n_threads;
         }
-        if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_pp_threads;
-        }
-        if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_tg_threads;
-        }
         return NULL;
     }
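The backend code above leans on a probe-then-run idiom: the same mul_mat / gemm template is instantiated with RUN=false to ask whether tinyBLAS can handle a given shape and type combination (from supports_mul_mat / supports_op) and with RUN=true to actually compute. A self-contained sketch of that pattern on plain float matrices follows; the function name, the support checks, and the exact indexing convention are illustrative assumptions, not the backend's real API:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative stand-in for ggml::backend::tinyblas::gemm<RUN>.
// Convention mirrors the header comment C = Aᵀ * B: row i of A (stride lda)
// is dotted with row j of B (stride ldb) and stored at C[j*ldc + i].
template <bool RUN>
bool gemm_f32(int64_t m, int64_t n, int64_t k,
              const float *A, int64_t lda,
              const float *B, int64_t ldb,
              float *C, int64_t ldc) {
    if (m <= 0 || n <= 0 || k <= 0) return false;   // "can we handle it?" checks
    if (!RUN) return true;                          // probe mode: no work is done
    for (int64_t j = 0; j < n; ++j) {
        for (int64_t i = 0; i < m; ++i) {
            float sum = 0.0f;
            for (int64_t l = 0; l < k; ++l) {
                sum += A[i*lda + l] * B[j*ldb + l];
            }
            C[j*ldc + i] = sum;
        }
    }
    return true;
}

int main() {
    const int64_t m = 2, n = 2, k = 3;
    std::vector<float> A = {1, 2, 3,  4, 5, 6};   // 2 rows of length 3
    std::vector<float> B = {1, 0, 1,  0, 1, 0};   // 2 rows of length 3
    std::vector<float> C(m * n, 0.0f);
    // Probe first (no data needed), then run the same template for real.
    if (gemm_f32<false>(m, n, k, nullptr, k, nullptr, k, nullptr, m)) {
        gemm_f32<true>(m, n, k, A.data(), k, B.data(), k, C.data(), m);
    }
    std::printf("C buffer: %g %g %g %g\n", C[0], C[1], C[2], C[3]);
    return 0;
}
```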


@@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas {
     // TODO
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
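The GGML_UNUSED(...) statements added after each `return false;` are the usual ggml idiom for keeping one gemm signature across all build configurations: when none of the SIMD branches behind the #ifdefs is compiled in, the body collapses to `return false;` and every parameter would otherwise trigger an unused-parameter warning. A reduced sketch of the pattern (the macro body matches what ggml.h defines, but treat that as an assumption here):

```cpp
#include <cstdint>

// ggml.h defines GGML_UNUSED roughly as below; redefined here so the
// sketch stands alone.
#define GGML_UNUSED(x) (void)(x)

// Stub kernel: with no SIMD branch compiled in, the body is just
// `return false;`, and the (void) casts keep the compiler from flagging
// every parameter as unused while leaving the callers' signature intact.
static bool gemm_stub(int64_t m, int64_t n, int64_t k,
                      const float *A, int64_t lda,
                      const float *B, int64_t ldb,
                      float *C, int64_t ldc,
                      int ith, int nth) {
#if defined(__AVX512F__)
    // ... a real vectorized kernel would be selected here ...
#endif
    return false;

    GGML_UNUSED(m);   GGML_UNUSED(n);   GGML_UNUSED(k);
    GGML_UNUSED(A);   GGML_UNUSED(lda);
    GGML_UNUSED(B);   GGML_UNUSED(ldb);
    GGML_UNUSED(C);   GGML_UNUSED(ldc);
    GGML_UNUSED(ith); GGML_UNUSED(nth);
}

int main() {
    float a = 0, b = 0, c = 0;
    // Returns false here, meaning "fall back to the generic path".
    return gemm_stub(1, 1, 1, &a, 1, &b, 1, &c, 1, 0, 1) ? 0 : 1;
}
```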


@@ -1,17 +1,56 @@
-#pragma once
-//#include <cstdint>
-#include "ggml.h"
-#define GGML_COMMON_DECL_C
-//#define GGML_COMMON_DECL_CPP
-#include "ggml-common.h"
-// appelé que depuis du c++ (le tinyBLAS backend)
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  | _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                            |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, LLaMA Now Goes Faster on CPUs, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+#pragma once
+#include "ggml.h"
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
 namespace ggml::backend::tinyblas {
-    // on est en C++
-    // => on peu avoir autant de fonction que de type.
-    // calcule C = Aᵀ * B
+    // compute: C = Aᵀ * B
     template<bool RUN>
     bool gemm(int64_t m, int64_t n, int64_t k,
               const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,