some cleanup with tinyblas backend

parent 7dd261f3e9
commit dda8847636

13 changed files with 264 additions and 150 deletions
@@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 
 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()
 
 if (NOT DEFINED GGML_AMX)
@@ -45,7 +45,7 @@ $ cmake \
     -DCMAKE_C_FLAGS="-march=armv8.7a" \
     -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
     -DGGML_OPENMP=OFF \
-    -DGGML_LLAMAFILE=OFF \
+    -DGGML_TINYBLAS=OFF \
     -B build-android
 ```
@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
 
 **Notes**:
 
-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:
@@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)
 
 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
 
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).
@@ -57,8 +57,8 @@ else()
 endif()
 
 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()
 
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
@@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)
@@ -91,10 +91,12 @@ struct ggml_backend_registry {
             return;
         }
 
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
                       __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+        //#ifndef NDEBUG
+        //    GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        //                   __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+        //#endif
         backends.push_back(reg);
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
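With the `#ifndef NDEBUG` guard commented out, every backend now announces itself at INFO level in release builds too. A minimal sketch of what that registration loop exposes, assuming the public registry API from `ggml-backend.h` (`ggml_backend_reg_count` / `ggml_backend_reg_get`, which this commit does not change):

```cpp
// Minimal sketch: prints the same "registered backend X (N devices)"
// information that the new GGML_LOG_INFO line reports at registration time.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        std::printf("backend %s (%zu devices)\n",
                    ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
    }
    return 0;
}
```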
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std C++ allows anonymous unions, but some compilers warn on them
+#define GGML_COMMON_AGGR_U data
+// std C++ does not allow anonymous structs at all
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half  ggml_half;
 typedef sycl::half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #endif
@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
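To see what the renamed macro pair buys, here is an expansion sketch for `block_q4_1` under the new strict-C++ branch. This is an illustration, not code from the commit; `ggml_half`/`ggml_half2` are spelled out per the `GGML_COMMON_DECL_CPP` typedefs above.

```cpp
// Expansion sketch (illustration only): block_q4_1 after the preprocessor
// in the GGML_COMMON_DECL_CPP branch, where both macros expand to the
// member name `data`.
#include <cstdint>

typedef uint16_t ggml_half;   // as in the GGML_COMMON_DECL_CPP branch
typedef uint32_t ggml_half2;

typedef struct {
    union {
        struct {
            ggml_half d;      // delta
            ggml_half m;      // min
        } data;               // "} GGML_COMMON_AGGR_S;" with AGGR_S == data
        ggml_half2 dm;
    } data;                   // "} GGML_COMMON_AGGR_U;" with AGGR_U == data
    uint8_t qs[16];           // QK4_1 / 2, with QK4_1 == 32
} block_q4_1_cpp;

// In the C branch both macros are empty, the aggregates stay anonymous,
// and the same fields are reached as b.d, b.m, b.dm; here they become
// b.data.data.d, b.data.data.m, b.data.dm. The memory layout is identical.
```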
@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4];         // 5-th bit of quants
     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -288,9 +305,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4-bit quants
 } block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits
@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
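The new `GGML_COMMON_DECL_CPP` / `GGML_COMMON_IMPL_CPP` branches let a C++ translation unit pull in the shared block declarations without going through the C branch; the tinyblas header later in this commit does exactly this. A minimal consumer sketch (the static_assert mirrors the ones in `ggml-common.h`):

```cpp
// Minimal consumer sketch: a C++ file opting into the new branch.
// Defining GGML_COMMON_DECL_CPP before including ggml-common.h selects
// <cstdint>, the ggml_half typedefs, and the named-aggregate macros.
#include "ggml.h"

#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"   // block_q4_1, block_q8_0, ... now declared

static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + 32, "wrong q8_0 layout");
```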
@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()
 
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    target_sources(ggml-cpu PRIVATE
-                   llamafile/sgemm.cpp
-                   llamafile/sgemm.h)
-endif()
-
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)
@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
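This `GGML_USE_LLAMAFILE` fast path is removed from the CPU backend; the equivalent dispatch now lives in the tinyblas backend's `mul_mat` (seen further down in this commit). The one subtle part worth keeping in mind is the broadcast indexing the deleted loop used, sketched here for illustration:

```cpp
// Indexing sketch (mirrors the deleted loop above, not new behavior):
// for batched mul_mat, src1 may have more batch planes than src0, and the
// broadcast factors r2 = ne12/ne02 and r3 = ne13/ne03 map each src1 plane
// (i12, i13) back onto the src0 plane it multiplies against.
#include <cstdint>

static const char * src0_plane(const char * src0_data,
                               int64_t i12, int64_t i13,
                               int64_t r2, int64_t r3,
                               int64_t nb02, int64_t nb03) {
    // i12/r2 and i13/r3 collapse r2 (resp. r3) consecutive src1 planes
    // onto one src0 plane; that is what "broadcast" means here.
    return src0_data + (i12/r2)*nb02 + (i13/r3)*nb03;
}
```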
@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;
 
     ggml_barrier(params->threadpool);
 
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
 // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;
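Same story for this second attempt, which ran after `src1` had been quantized to `vec_dot_type`: it also moves into the backend. Note how the converted copy of B is addressed; rows live in the scratch buffer at a fixed `row_size`, as in this illustration:

```cpp
// Sketch of the deleted wdata addressing (illustration only): once B is
// converted to vec_dot_type, plane (i12, i13) starts ne11 rows per i12
// step and ne12*ne11 rows per i13 step into the scratch buffer.
#include <cstddef>
#include <cstdint>

static const char * wdata_plane(const char * wdata,
                                int64_t i12, int64_t i13,
                                int64_t ne11, int64_t ne12,
                                size_t row_size) {
    return wdata + (i12*ne11 + i13*ne12*ne11)*row_size;
}
```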
@@ -1,3 +1,5 @@
+message(STATUS "Using TINYBLAS")
+
 add_library(ggml-tinyblas
             ggml-tinyblas.cpp
             )
@@ -225,6 +227,10 @@ endif()
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
+#set_source_files_properties(${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties(${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features(ggml-tinyblas PRIVATE cxx_std_17)
+
 if (EMSCRIPTEN)
     set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
@@ -1,3 +1,48 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "ggml-tinyblas.h"
@@ -7,8 +52,9 @@
 
 #include <memory>
 #include <cstring>
-#include <iostream>
 
+// TODO: see how to use threads/pool for all backends: ggml_graph_compute / ggml_threadpool
+// https://github.com/ggerganov/llama.cpp/pull/1999
 #ifdef GGML_USE_OPENMP
 #include <omp.h>
 #endif
@@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas {
         int n_threads = GGML_DEFAULT_N_THREADS;
         std::unique_ptr<char[]> work_data;
         size_t work_size = 0;
-        //int pp_threads = GGML_DEFAULT_N_THREADS;
-        //int tg_threads = GGML_DEFAULT_N_THREADS;
     };
 
     template<bool RUN>
@@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas {
             }
         }
 
-        // apres conversion de B: FP32 => src0->vec_dot_type
+        // after B is converted: FP32 => src0->vec_dot_type
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
             if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
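`mul_mat<false>` is the probe half of the `template<bool RUN>` pattern used throughout this backend: instantiated with `RUN = false` it only answers "is there a kernel for this type combination?", and with `RUN = true` it performs the multiplication on the same path. A hedged sketch of the idea (names simplified; the real signatures are in `ggml-tinyblas.h`):

```cpp
// Sketch of the two-phase template dispatch. With RUN == false nothing is
// computed and the return value is a pure capability check, so callers can
// pass dummy thread parameters (0, 1) as the code above does.
#include <cstdint>

template<bool RUN>
bool gemm_like(int64_t m, int64_t n, int64_t k /*, A, B, C, types ... */) {
    bool have_kernel = (m > 0 && n > 0 && k > 0);  // placeholder for the
    if (!have_kernel) return false;                // real kernel selection
    if (RUN) {
        // ... the actual tiled multiplication would run here ...
    }
    return true;  // supported (and executed when RUN == true)
}

// usage:
//   if (gemm_like<false>(m, n, k)) {   // cheap support probe
//       gemm_like<true>(m, n, k);      // same path, now does the work
//   }
```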
@@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas {
                                src1->data, nb11/ggml_type_size(src1->type),
                                dst->data, nb1/ggml_type_size(dst->type),
                                0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
-                // @ voir ca aurait etait bien de redimensioner work_data ici..
+                // TODO: find a way to resize work_data here
                 return true;
             }
         }
@@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas {
     const enum ggml_type type0 = src0->type;
     const enum ggml_type type1 = src1->type;
 
-    // les type "directs"
     // broadcast factors
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas {
     }
 UseGgmlGemm1:;
 
-    // apres conversion de B ?
+    // with B converted from FP32 -> vec_dot_type
     GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
     enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
     ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
-    // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
 
     if (src1->type != vec_dot_type) {
-        // OK on va au moins essayer de changer le type de B
-
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
 
-        // TOD0: vor si on peu caller ca dans supports_mul_mat
+        // TODO: see whether this check can move into supports_mul_mat
         if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
             ctx->work_data.reset(new char[ne13*nbw3]);
             ctx->work_size = ne13*nbw3;
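The scratch-buffer growth above relies on a classic fork-join convention: only thread 0 reallocates, and a barrier publishes the new buffer before any thread reads it. A sketch of the pattern under the same assumption the code makes (OpenMP available, and the function is entered from inside a parallel region):

```cpp
// Pattern sketch (assumes OpenMP, as the surrounding code does): thread 0
// grows the shared scratch buffer, the barrier makes the swap visible,
// then every thread may read ctx->work_data safely.
#include <cstddef>
#include <memory>

struct context_like {
    std::unique_ptr<char[]> work_data;
    size_t work_size = 0;
};

// must be called by all threads of the same parallel region
static char * ensure_work_size(context_like * ctx, size_t needed, int ith) {
    if (ith == 0 && ctx->work_size < needed) {
        ctx->work_data.reset(new char[needed]);  // single writer
        ctx->work_size = needed;
    }
    #pragma omp barrier                          // all threads wait here
    return ctx->work_data.get();                 // now safe on any thread
}
```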
@@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             char * wdata = ctx->work_data.get();
@@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             // mat-mul bis...
             for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas {
         delete backend;
     }
 
-    // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin...
-    //  - voir ggml_graph_compute / ggml_threadpool
-    //    https://github.com/ggerganov/llama.cpp/pull/1999
-    //
     static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
         context * ctx = (context *)backend->context;
@@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas {
                     mul_mat(ctx, node, ith, nth);
                 }
 #else
-                static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
                 mul_mat(ctx, node, 0, 1);
 #endif
                 break;
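For context: `graph_compute` fans out over OpenMP, so each node that tinyblas handles is executed by every thread of a parallel team, with the `(ith, nth)` pair splitting the row range. A sketch of the dispatch shape, assuming `GGML_USE_OPENMP` as the code above does:

```cpp
// Dispatch sketch (assumes GGML_USE_OPENMP, matching the backend): every
// thread enters the kernel with its rank/team-size pair, and the kernel
// partitions rows of the output among ranks.
#ifdef GGML_USE_OPENMP
#include <omp.h>

static void compute_node_sketch(int n_threads /*, context * ctx, node */) {
    #pragma omp parallel num_threads(n_threads)
    {
        int ith = omp_get_thread_num();   // this thread's rank
        int nth = omp_get_num_threads();  // actual team size
        // mul_mat(ctx, node, ith, nth);  // each rank works on its rows
        (void) ith; (void) nth;
    }
}
#endif
```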
@@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas {
         return backend != NULL && ggml_guid_matches(backend->guid, guid());
     }
 
-    // number of threads to use for compute
-    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->pp_threads = n_threads;
-    }
-
-    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->tg_threads = n_threads;
-    }
-
     static void set_n_threads(ggml_backend_t backend, int n_threads) {
         GGML_ASSERT(is_tinyblas(backend));
         context * ctx = (context *)backend->context;
         ctx->n_threads = n_threads;
-        //ctx->tg_threads = n_threads;
-        //ctx->pp_threads = n_threads;
     }
 
 }
@@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device {
     }
 
     static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
-        //const struct ggml_tensor * src0 = op->src[0];
-        //const struct ggml_tensor * src1 = op->src[1];
-
         switch (op->op) {
             case GGML_OP_NONE:
             case GGML_OP_RESHAPE:
@@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg {
         if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
             return (void *)ggml::backend::tinyblas::set_n_threads;
         }
-        if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_pp_threads;
-        }
-        if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_tg_threads;
-        }
         return NULL;
     }
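With the pp/tg variants gone, `ggml_backend_set_n_threads` is the only optional entry point left, and callers are expected to discover it through the registry rather than link it directly. A usage sketch, assuming the public `ggml_backend_reg_get_proc_address` API from `ggml-backend.h`:

```cpp
// Usage sketch: the setter is looked up by name; a NULL result simply
// means the backend does not expose it, so the call degrades gracefully.
#include "ggml-backend.h"

static void try_set_threads(ggml_backend_reg_t reg, ggml_backend_t backend, int n) {
    typedef void (*set_n_threads_fn)(ggml_backend_t, int);
    auto fn = (set_n_threads_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn) {
        fn(backend, n);   // forwarded to tinyblas's set_n_threads above
    }
}
```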
@@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
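The `GGML_UNUSED` tail (repeated in each `gemm` specialization below) silences `-Wunused-parameter` on architectures where the whole `#if`-guarded kernel body compiles away and `return false;` is all that remains. In ggml it is just a void cast:

```cpp
// From ggml.h (for reference): GGML_UNUSED evaluates its argument in a
// void context, which counts as a "use" for the compiler without
// generating any code.
#define GGML_UNUSED(x) (void)(x)
```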
@@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas {
     // TODO
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
                          const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1,17 +1,56 @@
-#pragma once
-//#include <cstdint>
-#include "ggml.h"
-#define GGML_COMMON_DECL_C
-//#define GGML_COMMON_DECL_CPP
-#include "ggml-common.h"
-
-// appelé que depuis du c++ (le tinyBLAS backend)
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
+#pragma once
+#include "ggml.h"
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 namespace ggml::backend::tinyblas {
 
-    // on est en C++
-    // => on peu avoir autant de fonction que de type.
-    // calcule C = Aᵀ * B
+    // compute: C = Aᵀ * B
     template<bool RUN>
     bool gemm(int64_t m, int64_t n, int64_t k,
               const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,