Merge a3822fb59b into cce5a90075
commit a27dc771c3
19 changed files with 1380 additions and 342 deletions
CMakeLists.txt
@@ -91,8 +91,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()

 if (NOT DEFINED GGML_AMX)
Makefile (8 changes)
@@ -567,8 +567,8 @@ ifdef GGML_NVPL
 endif # GGML_NVPL

 ifndef GGML_NO_LLAMAFILE
-	MK_CPPFLAGS  += -DGGML_USE_LLAMAFILE
-	OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
+	MK_CPPFLAGS  += -DGGML_USE_TINYBLAS
+	OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas-cpp17.o ggml/src/ggml-tinyblas/sgemm-cpp17.o
 endif

 ifndef GGML_NO_AMX

@@ -1099,6 +1099,10 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
 	ggml/src/ggml-impl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+# for c++17 build
+$(DIR_GGML)/%-cpp17.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@
+
 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
 	$(CC) $(CFLAGS) -MMD -c $< -o $@
docs/android.md
@@ -45,7 +45,7 @@ $ cmake \
   -DCMAKE_C_FLAGS="-march=armv8.7a" \
   -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
   -DGGML_OPENMP=OFF \
-  -DGGML_LLAMAFILE=OFF \
+  -DGGML_TINYBLAS=OFF \
   -B build-android
 ```
docs/build.md
@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.

 **Notes**:

-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:

@@ -405,4 +405,4 @@ To read documentation for how to build on Android, [click here](./android.md)

 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.

-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).
ggml/CMakeLists.txt
@@ -57,8 +57,8 @@ else()
 endif()

 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()

 if (NOT GGML_CUDA_GRAPHS_DEFAULT)

@@ -125,8 +125,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
-
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)

@@ -230,6 +229,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
ggml/include/ggml-cpu.h
@@ -124,7 +124,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
-    GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

     // Internal types and functions exposed for tests and benchmarks
ggml/include/ggml-tinyblas.h (new file, 17 lines)
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend register
+GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void);
+
+
+#ifdef __cplusplus
+}
+#endif
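Note: the new header only exposes the registration entry point; a client gets a usable backend instance through the generic ggml-backend device API. A minimal sketch, assuming the backend was compiled in (error handling and graph building elided):

```cpp
// Sketch: instantiating the tinyBLAS backend through the generic device API.
#include "ggml-backend.h"
#include "ggml-tinyblas.h"
#include <cstdio>

int main() {
    ggml_backend_reg_t reg = ggml_backend_tinyblas_reg();
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0); // the reg exposes a single device
    ggml_backend_t backend = ggml_backend_dev_init(dev, /*params=*/nullptr);
    if (!backend) {
        fprintf(stderr, "failed to init tinyBLAS backend\n");
        return 1;
    }
    // ... build a ggml graph and run it with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
    return 0;
}
```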
ggml/src/CMakeLists.txt
@@ -258,6 +258,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)
ggml/src/ggml-backend-reg.cpp
@@ -27,6 +27,10 @@
 #include "ggml-blas.h"
 #endif

+#ifdef GGML_USE_TINYBLAS
+#include "ggml-tinyblas.h"
+#endif
+
 #ifdef GGML_USE_RPC
 #include "ggml-rpc.h"
 #endif

@@ -66,6 +70,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
+#ifdef GGML_USE_TINYBLAS
+        register_backend(ggml_backend_tinyblas_reg());
+#endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif

@@ -84,10 +91,12 @@ struct ggml_backend_registry {
             return;
         }

-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
                        __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+//#ifndef NDEBUG
+//        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+//                       __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+//#endif
         backends.push_back(reg);
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
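With this registration in place, the tinyBLAS reg shows up alongside the other compiled-in backends and can be enumerated through the public registry API; a small sketch:

```cpp
// Sketch: listing whatever backends the registry picked up at build time.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("backend: %s (%zu devices)\n",
               ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
    }
    return 0;
}
```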
ggml/src/ggml-common.h
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;

-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S

 #define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)

@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)

@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)

@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half  ggml_half;
 typedef half2 ggml_half2;

-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)

@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half  ggml_half;
 typedef sycl::half2 ggml_half2;

-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data

 #define GGML_COMMON_DECL
 #endif

@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;

@@ -196,9 +213,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

@@ -261,9 +278,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

@@ -288,9 +305,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;

@@ -305,9 +322,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits

@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
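For illustration, here is roughly what `block_q4_1` looks like once the macros above expand under `GGML_COMMON_DECL_CPP`, where both `GGML_COMMON_AGGR_U` and `GGML_COMMON_AGGR_S` expand to `data` (the type name and `QK4_1` value are repeated here only to make the sketch self-contained; under plain C both macros expand to nothing and the aggregates stay anonymous):

```cpp
// Sketch: expanded form of block_q4_1 for the C++ declaration path.
#include <cstdint>
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define QK4_1 32

typedef struct {
    union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        } data;          // GGML_COMMON_AGGR_S
        ggml_half2 dm;
    } data;              // GGML_COMMON_AGGR_U
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;

// C++ access goes through the named members: blk.data.data.d, blk.data.dm.
// C access stays blk.d / blk.dm thanks to the anonymous union and struct.
```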
ggml/src/ggml-cpu/CMakeLists.txt
@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()

-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    target_sources(ggml-cpu PRIVATE
-                llamafile/sgemm.cpp
-                llamafile/sgemm.h)
-endif()
-
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)
ggml/src/ggml-cpu/ggml-cpu.c
@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif

-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)

@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;

@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;

     ggml_barrier(params->threadpool);

-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;

@@ -13863,14 +13804,6 @@ int ggml_cpu_has_wasm_simd(void) {
 #endif
 }

-int ggml_cpu_has_llamafile(void) {
-#if defined(GGML_USE_LLAMAFILE)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_sse3(void) {
 #if defined(__SSE3__)
     return 1;
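The second deleted block's preparation step, converting float activations into the weight type's companion `vec_dot_type` before the GEMM, is the part that moves into the tinyBLAS backend later in this diff. In isolation it looks roughly like this sketch built on the same `ggml_get_type_traits_cpu` API the new backend uses (the helper name is hypothetical):

```cpp
// Sketch: converting one row of float activations into a weight type's
// companion vec_dot_type, as the deleted llamafile path did via wdata.
#include "ggml-cpu.h"

void convert_row(enum ggml_type weight_type, const float * src, void * dst, int64_t ne) {
    const struct ggml_type_traits_cpu * traits = ggml_get_type_traits_cpu(weight_type);
    enum ggml_type vec_dot_type = traits->vec_dot_type;              // e.g. GGML_TYPE_Q8_0 for Q4_0 weights
    ggml_from_float_t from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
    from_float(src, dst, ne);                                        // dst must hold ggml_row_size(vec_dot_type, ne) bytes
}
```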
ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -616,9 +616,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_wasm_simd()) {
             features.push_back({ "WASM_SIMD", "1" });
         }
-        if (ggml_cpu_has_llamafile()) {
-            features.push_back({ "LLAMAFILE", "1" });
-        }

         features.push_back({ nullptr, nullptr });
ggml/src/ggml-cpu/llamafile/sgemm.h (deleted)
@@ -1,14 +0,0 @@
-#pragma once
-#include <stdint.h>
-#include <stdbool.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
-                     const void *, int64_t, void *, int64_t, int, int,
-                     int, int, int);
-
-#ifdef __cplusplus
-}
-#endif
ggml/src/ggml-tinyblas/CMakeLists.txt (new file, 236 lines)
@@ -0,0 +1,236 @@
+message(STATUS "Using TINYBLAS")
+
+add_library(ggml-tinyblas
+            ggml-tinyblas.cpp
+            )
+
+target_link_libraries(ggml-tinyblas PRIVATE ggml-base)
+target_include_directories(ggml-tinyblas PRIVATE . ..)
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+target_sources(ggml-tinyblas PRIVATE
+               sgemm.cpp
+               sgemm.h)
+
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(../ggml-cpu/cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features (ggml-tinyblas PRIVATE cxx_std_17)
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()
ggml/src/ggml-tinyblas/ggml-tinyblas.cpp (new file, 484 lines)
@@ -0,0 +1,484 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                                 _   _          ___ _      _   ___
+//                                | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                                | _| | ' \ || | _ \ |__ / _ \\__ \.
+//                                 \__|_|_||_\_, |___/____/_/ \_\___/
+//                                           |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include "ggml-tinyblas.h"
+#include "ggml-backend-impl.h"
+
+#include "sgemm.h"
+
+#include <memory>
+#include <cstring>
+
+// TODO: see how to use threads/pool for all backends: ggml_graph_compute / ggml_threadpool
+// https://github.com/ggerganov/llama.cpp/pull/1999
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace ggml::backend::tinyblas {
+
+    static const char* NAME = "tinyBLAS";
+
+    struct context {
+        int n_threads = GGML_DEFAULT_N_THREADS;
+        std::unique_ptr<char[]> work_data;
+        size_t work_size = 0;
+    };
+
+    template<bool RUN>
+    static bool mul_mat(int64_t m, int64_t n, int64_t k,
+                        const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc,
+                        int ith, int nth,
+                        const enum ggml_type Atype, const enum ggml_type Btype, const enum ggml_type Ctype)
+    {
+        GGML_ASSERT(Ctype == GGML_TYPE_F32);
+        switch (Atype) {
+        case GGML_TYPE_F32:
+            if (Btype != GGML_TYPE_F32) return false;
+            return gemm<RUN>(m, n, k, (const float*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_F16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_F16:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_BF16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_BF16:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_Q8_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q4_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q5_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q5_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_iq4_nl*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        default:
+            return false;
+        }
+        return false;
+    }
+
+    static bool supports_mul_mat(ggml_backend_dev_t, const struct ggml_tensor * dst) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        if (dst->type != GGML_TYPE_F32) return false;
+
+        if (ne0 != ne01) return false;
+        if (ne1 != ne11) return false;
+        if (ne2 != ne12) return false;
+        if (ne3 != ne13) return false;
+
+        // we don't support permuted src0 or src1
+        if (nb00 != ggml_type_size(src0->type)) return false;
+        if (nb10 != ggml_type_size(src1->type)) return false;
+
+        // dst cannot be transposed or permuted
+        if (nb0 != sizeof(float)) return false;
+        if (nb0 > nb1) return false;
+        if (nb1 > nb2) return false;
+        if (nb2 > nb3) return false;
+
+        if (ggml_is_contiguous(src1)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                               src0->data, nb01/ggml_type_size(src0->type),
+                               src1->data, nb11/ggml_type_size(src1->type),
+                               dst->data, nb1/ggml_type_size(dst->type),
+                               0, 1, src0->type, src1->type, GGML_TYPE_F32)) {
+                return true;
+            }
+        }
+
+        // after convert B: FP32 => src0->vec_dot_type
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                               src0->data, nb01/ggml_type_size(src0->type),
+                               src1->data, nb11/ggml_type_size(src1->type),
+                               dst->data, nb1/ggml_type_size(dst->type),
+                               0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
+                // TODO: how to resize work_data here
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static void mul_mat(ggml::backend::tinyblas::context * ctx, struct ggml_tensor * dst, const int ith, const int nth) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const enum ggml_type type0 = src0->type;
+        const enum ggml_type type1 = src1->type;
+
+        // broadcast factors
+        const int64_t r2 = ne12 / ne02;
+        const int64_t r3 = ne13 / ne03;
+
+        if (ggml_is_contiguous(src1)) {
+            for (int64_t i13 = 0; i13 < ne13; i13++) {
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)src1->data + i12*nb12 + i13*nb13;
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                       data0, nb01/ggml_type_size(src0->type),
+                                       data1, nb11/ggml_type_size(src1->type),
+                                       data, nb1/ggml_type_size(dst->type),
+                                       ith, nth, type0, type1, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm1;
+                    }
+                }
+            }
+            return;
+        }
+UseGgmlGemm1:;
+
+        // with B converted from FP32 -> vec_dot_type
+        GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
+        ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
+
+        if (src1->type != vec_dot_type) {
+            const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+            const size_t nbw2 = nbw1*ne11;
+            const size_t nbw3 = nbw2*ne12;
+
+            // TODO: move to: supports_mul_mat
+            if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
+                ctx->work_data.reset(new char[ne13*nbw3]);
+                ctx->work_size = ne13*nbw3;
+            }
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            char * wdata = ctx->work_data.get();
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                        from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                   (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                   ne10);
+                    }
+                }
+            }
+
+            // synchronize all threads!
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            // mat-mul again...
+            for (int64_t i13 = 0; i13 < ne13; i13++)
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)wdata + i12*nbw2 + i13*nbw3;
+
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                       data0, nb01/ggml_type_size(src0->type),
+                                       data1, nbw1/ggml_type_size(vec_dot_type),
+                                       data, nb1/ggml_type_size(dst->type),
+                                       ith, nth, type0, vec_dot_type, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm2;
+                    }
+                }
+            return;
+        }
+UseGgmlGemm2:;
+    }
+
+    static const char * get_name(ggml_backend_t /*backend*/) {
+        return NAME;
+    }
+
+    static void free(ggml_backend_t backend) {
+        context * ctx = (context *)backend->context;
+        delete ctx;
+        delete backend;
+    }
+
+    static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+        context * ctx = (context *)backend->context;
+
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+
+            switch (node->op) {
+            case GGML_OP_MUL_MAT:
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel num_threads(ctx->n_threads)
+                {
+                    int ith = omp_get_thread_num();
+                    int nth = ctx->n_threads;
+                    mul_mat(ctx, node, ith, nth);
+                }
+#else
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+                mul_mat(ctx, node, 0, 1);
+#endif
+                break;
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+                break;
+
+            default:
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+            }
+        }
+
+        return GGML_STATUS_SUCCESS;
+    }
+
+    static struct ggml_backend_i interface = {
+        /* .get_name           = */ get_name,
+        /* .free               = */ free,
+        /* .set_tensor_async   = */ NULL,
+        /* .get_tensor_async   = */ NULL,
+        /* .cpy_tensor_async   = */ NULL,
+        /* .synchronize        = */ NULL,
+        /* .graph_plan_create  = */ NULL,
+        /* .graph_plan_free    = */ NULL,
+        /* .graph_plan_update  = */ NULL,
+        /* .graph_plan_compute = */ NULL,
+        /* .graph_compute      = */ graph_compute,
+        /* .event_record       = */ NULL,
+        /* .event_wait         = */ NULL,
+    };
+
+    static ggml_guid_t guid(void) {
+        static ggml_guid guid = { 0x23, 0xf5, 0x9f, 0xa2, 0xb1, 0x48, 0x39, 0x25, 0x83, 0xcd, 0x79, 0x16, 0xb7, 0x23, 0x94, 0xde };
+        return &guid;
+    }
+
+    static ggml_backend_t init(void) {
+        context * ctx = new context;
+
+        ggml_backend_t backend = new ggml_backend {
+            /* .guid      = */ guid(),
+            /* .interface = */ interface,
+            /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_tinyblas_reg(), 0),
+            /* .context   = */ ctx,
+        };
+
+        return backend;
+    }
+
+    static bool is_tinyblas(ggml_backend_t backend) {
+        return backend != NULL && ggml_guid_matches(backend->guid, guid());
+    }
+
+    static void set_n_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        ctx->n_threads = n_threads;
+    }
+
+}
+
+// device interface
+namespace ggml::backend::tinyblas::device {
+    static const char * get_name(ggml_backend_dev_t) {
+        return "BLAS";
+    }
+
+    static const char * get_description(ggml_backend_dev_t) {
+        return "tinyBLAS";
+    }
+
+    static void get_memory(ggml_backend_dev_t, size_t * free, size_t * total) {
+        // TODO
+        *free = 0;
+        *total = 0;
+    }
+
+    static enum ggml_backend_dev_type get_type(ggml_backend_dev_t) {
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    }
+
+    static void get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+        props->name        = get_name(dev);
+        props->description = get_description(dev);
+        props->type        = get_type(dev);
+        get_memory(dev, &props->memory_free, &props->memory_total);
+        props->caps = {
+            /* .async                = */ false,
+            /* .host_buffer          = */ false,
+            /* .buffer_from_host_ptr = */ true,
+            /* .events               = */ false,
+        };
+    }
+
+    static ggml_backend_t init_backend(ggml_backend_dev_t, const char *) {
+        return ggml::backend::tinyblas::init();
+    }
+
+    static ggml_backend_buffer_type_t get_buffer_type(ggml_backend_dev_t) {
+        return ggml_backend_cpu_buffer_type();
+    }
+
+    static ggml_backend_buffer_t buffer_from_host_ptr(ggml_backend_dev_t, void * ptr, size_t size, size_t) {
+        return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    }
+
+    static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+        switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+        case GGML_OP_MUL_MAT:
+            return supports_mul_mat(device, op);
+        default:
+            return false;
+        }
+    }
+
+    static bool supports_buft(ggml_backend_dev_t, ggml_backend_buffer_type_t buft) {
+        return ggml_backend_buft_is_host(buft);
+    }
+
+    static const struct ggml_backend_device_i interface = {
+        /* .get_name             = */ get_name,
+        /* .get_description      = */ get_description,
+        /* .get_memory           = */ get_memory,
+        /* .get_type             = */ get_type,
+        /* .get_props            = */ get_props,
+        /* .init_backend         = */ init_backend,
+        /* .get_buffer_type      = */ get_buffer_type,
+        /* .get_host_buffer_type = */ NULL,
+        /* .buffer_from_host_ptr = */ buffer_from_host_ptr,
+        /* .supports_op          = */ supports_op,
+        /* .supports_buft        = */ supports_buft,
+        /* .offload_op           = */ NULL,
+        /* .event_new            = */ NULL,
+        /* .event_free           = */ NULL,
+        /* .event_synchronize    = */ NULL,
+    };
+
+}
+
+// backend reg interface
+namespace ggml::backend::tinyblas::reg {
+    static const char * get_name(ggml_backend_reg_t) {
+        return ggml::backend::tinyblas::NAME;
+    }
+
+    static size_t get_device_count(ggml_backend_reg_t) {
+        return 1;
+    }
+
+    static ggml_backend_dev_t get_device(ggml_backend_reg_t reg, size_t index) {
+        GGML_ASSERT(index == 0);
+
+        static ggml_backend_device device = {
+            /* .iface   = */ ggml::backend::tinyblas::device::interface,
+            /* .reg     = */ reg,
+            /* .context = */ nullptr,
+        };
+
+        return &device;
+    }
+
+    static void * get_proc_address(ggml_backend_reg_t, const char * name) {
+        if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+            return (void *)ggml::backend::tinyblas::set_n_threads;
+        }
+        return NULL;
+    }
+
+    static const struct ggml_backend_reg_i interface = {
+        /* .get_name         = */ get_name,
+        /* .get_device_count = */ get_device_count,
+        /* .get_device       = */ get_device,
+        /* .get_proc_address = */ get_proc_address,
+    };
+
+}
+
+ggml_backend_reg_t ggml_backend_tinyblas_reg(void) {
+    static struct ggml_backend_reg backend_reg = {
+        /* .iface   = */ ggml::backend::tinyblas::reg::interface,
+        /* .context = */ NULL,
+    };
+    return &backend_reg;
+}
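The backend deliberately exposes its thread-count setter only through `get_proc_address`, mirroring the CPU backend's convention. A hedged sketch of the caller side, assuming the `ggml_backend_set_n_threads_t` function-pointer convention from ggml-backend.h:

```cpp
// Sketch: fetching and using the thread-count setter exposed above.
#include "ggml-backend.h"
#include "ggml-tinyblas.h"

ggml_backend_t make_tinyblas_backend(int n_threads) {
    ggml_backend_reg_t reg = ggml_backend_tinyblas_reg();
    ggml_backend_t backend = ggml_backend_dev_init(ggml_backend_reg_dev_get(reg, 0), nullptr);
    auto set_n_threads = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (backend && set_n_threads) {
        set_n_threads(backend, n_threads); // forwards to context::n_threads
    }
    return backend; // caller eventually calls ggml_backend_free(backend)
}
```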
|
@ -50,8 +50,6 @@
|
|||
|
||||
#include "sgemm.h"
|
||||
#include "ggml-impl.h"
|
||||
// hack until moved into the CPU backend
|
||||
#include "../ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
@ -135,6 +133,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
|
|||
return _mm512_fmadd_ps(a, b, c);
|
||||
}
|
||||
#endif
|
||||
#if defined(__AVX512BF16__)
|
||||
template <>
|
||||
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
|
||||
return _mm512_dpbf16_ps(c, a, b);
|
||||
}
|
||||
template <>
|
||||
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
|
||||
return _mm256_dpbf16_ps(c, a, b);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
|
@ -226,6 +234,13 @@ template <> inline __m256 load(const float *p) {
|
|||
}
|
||||
#endif // __AVX__
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||
template <> inline __m256 load(const ggml_bf16_t *p) {
|
||||
return _mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__F16C__)
|
||||
template <> inline __m256 load(const ggml_fp16_t *p) {
|
||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
|
||||
|
@ -239,8 +254,27 @@ template <> inline __m512 load(const float *p) {
|
|||
template <> inline __m512 load(const ggml_fp16_t *p) {
|
||||
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
|
||||
}
|
||||
template <> inline __m512 load(const ggml_bf16_t *p) {
|
||||
return _mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
#if defined(__AVX512BF16__)
|
||||
template <> inline __m512bh load(const ggml_bf16_t *p) {
|
||||
return (__m512bh)_mm512_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m256bh load(const ggml_bf16_t *p) {
|
||||
return (__m256bh)_mm256_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m512bh load(const float *p) {
|
||||
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
|
||||
}
|
||||
template <> inline __m256bh load(const float *p) {
|
||||
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
|
@ -1627,259 +1661,494 @@ class tinyBLAS_PPC {
|
|||
#endif
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* Performs optimized matrix multiplication on CPU.
|
||||
*
|
||||
* This subroutine may compute C = Aᵀ * B with column major ordering.
|
||||
* Despite its name, this isn't a generalized implementation. Work is
|
||||
* only performed when a handwritten kernel is written and available.
|
||||
* Otherwise the caller should fall back to a general matmul routine.
|
||||
*
|
||||
* For example, for single-threaded single-precision GEMM you can say
|
||||
*
|
||||
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
|
||||
* 0, 1,
|
||||
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
|
||||
*
|
||||
* @param m is rows in `A` and `C`
|
||||
* @param n is cols in `B` and `C`
|
||||
* @param k is cols in `A` and rows in `B`
|
||||
* @param A is first input matrix (always transposed)
|
||||
* @param lda is row stride of `A`
|
||||
* @param B is second input matrix (never transposed)
|
||||
* @param ldb is row stride of `B`
|
||||
* @param C is input/output array of output matrices
|
||||
* @param ldc is row stride of `C`
|
||||
* @param ith is thread id (must be less than `nth`)
|
||||
* @param nth is number of threads (must be greater than zero)
|
||||
* @param Atype is GGML data type of `A`
|
||||
* @param Btype is GGML data type of `B`
|
||||
* @param Ctype is GGML data type of `C`
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
|
||||
namespace ggml::backend::tinyblas {
|
||||
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
assert(k >= 0);
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
/**
|
||||
* Performs optimized matrix multiplication on CPU.
|
||||
*
|
||||
* This subroutine may compute C = Aᵀ * B with column major ordering.
|
||||
* Despite its name, this isn't a generalized implementation. Work is
|
||||
* only performed when a handwritten kernel is written and available.
|
||||
* Otherwise the caller should fall back to a general matmul routine.
|
||||
*
|
||||
* For example, for single-threaded single-precision GEMM you can say
|
||||
*
|
||||
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1);
|
||||
*
|
||||
* @param m is rows in `A` and `C`
|
||||
* @param n is cols in `B` and `C`
|
||||
* @param k is cols in `A` and rows in `B`
|
||||
* @param A is first input matrix (always transposed)
|
||||
* @param lda is row stride of `A`
|
||||
* @param B is second input matrix (never transposed)
|
||||
* @param ldb is row stride of `B`
|
||||
* @param C is input/output array of output matrices
|
||||
* @param ldc is row stride of `C`
|
||||
* @param ith is thread id (must be less than `nth`)
|
||||
* @param nth is number of threads (must be greater than zero)
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
|
||||
// only enable sgemm for prompt processing
|
||||
if (n < 2)
|
||||
return false;
|
||||
|
||||
if (Ctype != GGML_TYPE_F32)
|
||||
return false;
|
||||
|
||||
switch (Atype) {
|
||||
|
||||
case GGML_TYPE_F32: {
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
template<bool RUN>
|
||||
bool gemm(int64_t m, int64_t n, int64_t k,
|
||||
const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth) {
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
assert(k >= 0);
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__AVX__) || defined(__AVX2__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__ARM_NEON)
|
||||
if (n < 4)
|
||||
return false;
|
||||
if (k % 4)
|
||||
return false;
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__MMA__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
tinyBLAS_PPC<float, float, float> tb{
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
if ((k % 16) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if defined(__AVX__) || defined(__AVX2__)
|
||||
if ((k % 8)==0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
if ((k % 4) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
// TODO: voir a mettre ca dans un autre fichier...
|
||||
#if defined(__MMA__)
|
||||
if ((k % 8) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS_PPC<float, float, float> tb{ k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
GGML_UNUSED(m);
|
||||
GGML_UNUSED(n);
|
||||
GGML_UNUSED(k);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(lda);
|
||||
GGML_UNUSED(B);
|
||||
GGML_UNUSED(ldb);
|
||||
GGML_UNUSED(C);
|
||||
GGML_UNUSED(ldc);
|
||||
GGML_UNUSED(ith);
|
||||
GGML_UNUSED(nth);
|
||||
}
|
||||
template bool gemm<true>(int64_t m, int64_t n, int64_t k,
|
||||
const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
|
||||
const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
|
||||
case GGML_TYPE_F16: {
|
||||
template<bool RUN>
|
||||
bool gemm(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth) {
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
assert(k >= 0);
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if ((k % 16) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
||||
if ((k % 8) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
if ((k % 4) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
GGML_UNUSED(m);
|
||||
GGML_UNUSED(n);
|
||||
GGML_UNUSED(k);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(lda);
|
||||
GGML_UNUSED(B);
|
||||
GGML_UNUSED(ldb);
|
||||
GGML_UNUSED(C);
|
||||
GGML_UNUSED(ldc);
|
||||
GGML_UNUSED(ith);
|
||||
GGML_UNUSED(nth);
|
||||
}
|
||||
template bool gemm<true>(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
|
||||
template<bool RUN>
|
||||
bool gemm(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth) {
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
assert(k >= 0);
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
#if defined(__AVX512F__)
|
||||
if ((k % 16) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
||||
if ((k % 8) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
|
||||
if ((k % 8) == 0) {
|
||||
if constexpr (RUN) {
|
||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
|
||||
tb.matmul(m, n);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
GGML_UNUSED(m);
|
||||
GGML_UNUSED(n);
|
||||
GGML_UNUSED(k);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(lda);
|
||||
GGML_UNUSED(B);
|
||||
GGML_UNUSED(ldb);
|
||||
GGML_UNUSED(C);
|
||||
GGML_UNUSED(ldc);
|
||||
GGML_UNUSED(ith);
|
||||
GGML_UNUSED(nth);
|
||||
}
|
||||
template bool gemm<true>(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);
|
||||
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
|
||||
const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
|
||||
int ith, int nth);

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX512BF16__)
    // TODO: requires converting B to bf16 first before the __m512bh kernel can be used.
    //if ((k % 32) == 0) {
    //    if constexpr (RUN) {
    //        tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
    //        tb.matmul(m, n);
    //    }
    //    return true;
    //}
#elif defined(__AVX512F__)
    if ((k % 16) == 0) {
        if constexpr (RUN) {
            tinyBLAS<16, __m512, __m512, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n);
        }
        return true;
    }
#elif defined(__AVX2__)
    // Mirrors the __m512 path above; assumes a bf16 -> __m256 load specialization.
    if ((k % 8) == 0) {
        if constexpr (RUN) {
            tinyBLAS<8, __m256, __m256, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n);
        }
        return true;
    }
#endif
    return false;
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX512BF16__)
    if ((k % 32) == 0) {
        if constexpr (RUN) {
            tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n);
        }
        return true;
    }
    // second chance: fall back to the 16-wide kernel when k is not a multiple of 32
    if ((k % 16) == 0) {
        if constexpr (RUN) {
            tinyBLAS<16, __m256, __m256bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n);
        }
        return true;
    }
#endif
    return false;
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
    if constexpr (RUN) {
        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#elif defined(__ARM_FEATURE_DOTPROD)
    if constexpr (RUN) {
        tinyBLAS_Q0_ARM<block_q8_0> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#else
    return false;
#endif
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
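
// For the quantized variants below, A and B point at 32-element quant blocks
// (block_q8_0, block_q4_0, ...), so k, lda and ldb are measured in blocks
// rather than in scalar elements.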

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
    if constexpr (RUN) {
        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#elif defined(__ARM_FEATURE_DOTPROD)
    if constexpr (RUN) {
        tinyBLAS_Q0_ARM<block_q4_0> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#else
    return false;
#endif
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
    if constexpr (RUN) {
        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#else
    return false;
#endif
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);

template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith, int nth) {
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
    if constexpr (RUN) {
        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth};
        tb.matmul(m, n);
    }
    return true;
#else
    return false;
#endif
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
    GGML_UNUSED(A);
    GGML_UNUSED(lda);
    GGML_UNUSED(B);
    GGML_UNUSED(ldb);
    GGML_UNUSED(C);
    GGML_UNUSED(ldc);
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);
}

template bool gemm<true>(int64_t m, int64_t n, int64_t k,
        const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);
template bool gemm<false>(int64_t m, int64_t n, int64_t k,
        const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
        int ith, int nth);

}  // namespace ggml::backend::tinyblas

90
ggml/src/ggml-tinyblas/sgemm.h
Normal file

@ -0,0 +1,90 @@

// Copyright 2024 Mozilla Foundation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

//
//                   _   _          ___ _      _   ___
//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
//                   \__|_|_||_\_, |___/____/_/ \_\___/
//                             |__/
//
//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

#pragma once
#include "ggml.h"
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"

namespace ggml::backend::tinyblas {

// compute: C = Aᵀ * B
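// A is m×k and B is n×k with k as the contiguous dimension, so the leading
// dimensions must satisfy lda >= k, ldb >= k and ldc >= m (see the asserts in
// sgemm.cpp). RUN=false probes kernel support; RUN=true performs the matmul.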
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);
template<bool RUN>
bool gemm(int64_t m, int64_t n, int64_t k,
          const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
          int ith=0, int nth=1);

}  // namespace ggml::backend::tinyblas
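
For illustration, a minimal caller sketch of the probe-then-run pattern these declarations enable. It is hypothetical and not part of this patch; the file name and buffer sizes are assumptions, with leading dimensions chosen to satisfy the asserts in the definitions.

// tinyblas_usage.cpp -- hypothetical example, for illustration only
#include "sgemm.h"

#include <cstdint>
#include <vector>

int main() {
    const int64_t m = 64, n = 32, k = 128;  // k is a multiple of every kernel's SIMD width
    std::vector<float> A(m * k), B(n * k), C(m * n);
    using ggml::backend::tinyblas::gemm;
    // RUN = false: ask whether this build has a kernel for the shape; computes nothing.
    if (gemm<false>(m, n, k, A.data(), k, B.data(), k, C.data(), m)) {
        // RUN = true: compute C = Aᵀ * B on a single thread (default ith = 0, nth = 1).
        gemm<true>(m, n, k, A.data(), k, B.data(), k, C.data(), m);
    }
    return 0;
}

In a threaded build, each worker would call gemm<true> with its own ith/nth pair so the kernel can partition the output among threads.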

@ -22216,7 +22216,6 @@ const char * llama_print_system_info(void) {
    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";

    return s.c_str();
}