some cleanup with tinyblas backend

parent 7dd261f3e9
commit dda8847636

13 changed files with 264 additions and 150 deletions
|
@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
|||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||
if (NOT DEFINED GGML_TINYBLAS)
|
||||
set(GGML_TINYBLAS ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_AMX)
|
||||
|
|
|
@@ -45,7 +45,7 @@ $ cmake \
 -DCMAKE_C_FLAGS="-march=armv8.7a" \
 -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
 -DGGML_OPENMP=OFF \
--DGGML_LLAMAFILE=OFF \
+-DGGML_TINYBLAS=OFF \
 -B build-android
 ```
 
@@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
 
 **Notes**:
 
-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
+- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
 - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
 - For faster repeated compilation, install [ccache](https://ccache.dev/).
 - For debug builds, there are two cases:
@@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)
 
 Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
 
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).
@@ -57,8 +57,8 @@ else()
 endif()
 
 # defaults
-if (NOT GGML_LLAMAFILE_DEFAULT)
-    set(GGML_LLAMAFILE_DEFAULT OFF)
+if (NOT GGML_TINYBLAS_DEFAULT)
+    set(GGML_TINYBLAS_DEFAULT OFF)
 endif()
 
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
@@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
-
+option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
+    include/ggml-tinyblas.h
     include/ggml-vulkan.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
+ggml_add_backend(TINYBLAS)
 ggml_add_backend(Vulkan)
 ggml_add_backend(MUSA)
 
@@ -91,10 +91,12 @@ struct ggml_backend_registry {
             return;
         }
 
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
                        __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
-#endif
+//#ifndef NDEBUG
+//        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+//                       __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+//#endif
         backends.push_back(reg);
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
 typedef half ggml_half;
 typedef half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
 typedef sycl::half ggml_half;
 typedef sycl::half2 ggml_half2;
 
-#define GGML_COMMON_AGGR data
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
 
 #define GGML_COMMON_DECL
 #endif
@@ -154,9 +171,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
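As an aside: under the plain C declarations both `GGML_COMMON_AGGR_U` and `GGML_COMMON_AGGR_S` expand to nothing, so the union and struct above stay anonymous and `d`, `m`, `dm` are reached directly; under the new `GGML_COMMON_DECL_CPP` branch both expand to `data`, naming the aggregates so that standard C++ (which does not allow anonymous structs) accepts them. A minimal sketch of what `block_q4_1` expands to in the C++ case, assuming only the macros shown in this diff:

```cpp
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define QK4_1 32

// Expansion under GGML_COMMON_DECL_CPP, where both aggregate macros
// expand to the member name `data`:
typedef struct {
    union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        } data;          // GGML_COMMON_AGGR_S
        ggml_half2 dm;   // the same 4 bytes, viewed as one half2
    } data;              // GGML_COMMON_AGGR_U
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;

static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2,
              "wrong q4_1 block size/padding");

// In C (both macros empty) the fields are reached as b.d / b.dm;
// in this C++ expansion they become b.data.data.d / b.data.dm.
```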
@@ -175,9 +192,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 ds;
-    };
+    } GGML_COMMON_AGGR_U;
     int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
@@ -288,9 +305,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
+        } GGML_COMMON_AGGR_S;
         ggml_half2 dm;
-    };
+    } GGML_COMMON_AGGR_U;
     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];           // quants, high bit
     uint8_t qs[QK_K/2];           // quants, low 4 bits
@@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
@@ -44,16 +44,6 @@ if (GGML_OPENMP)
     endif()
 endif()
 
-if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
-
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    target_sources(ggml-cpu PRIVATE
-                    llamafile/sgemm.cpp
-                    llamafile/sgemm.h)
-endif()
-
 if (GGML_CPU_HBM)
     find_library(memkind memkind REQUIRED)
 
@@ -39,14 +39,6 @@
 #include <omp.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
-#undef GGML_USE_LLAMAFILE
-#endif
-
-#ifdef GGML_USE_LLAMAFILE
-#include "llamafile/sgemm.h"
-#endif
-
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if GGML_USE_LLAMAFILE
-    // broadcast factors
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)src1->data + i12*nb12 + i13*nb13,
-                                     nb11/ggml_type_size(src1->type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     src1->type,
-                                     dst->type))
-                    goto UseGgmlGemm1;
-        return;
-    }
-UseGgmlGemm1:;
-#endif
-
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
 
@@ -7530,30 +7495,6 @@ UseGgmlGemm1:;
 
     ggml_barrier(params->threadpool);
 
-#if GGML_USE_LLAMAFILE
-    if (src1->type != vec_dot_type) {
-        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
-                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
-                                     nb01/ggml_type_size(type),
-                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
-                                     row_size/ggml_type_size(vec_dot_type),
-                                     (char *)dst->data + i12*nb2 + i13*nb3,
-                                     nb1/ggml_type_size(dst->type),
-                                     ith, nth,
-                                     type,
-                                     vec_dot_type,
-                                     dst->type))
-                    goto UseGgmlGemm2;
-        return;
-    }
-UseGgmlGemm2:;
-#endif
-
     // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;
 
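The two deleted blocks share a try-the-fast-path-then-fall-through shape: every (i12, i13) tile is offered to `llamafile_sgemm`, and if any tile is declined the code jumps over the early `return` into the generic path. A minimal sketch of that control flow, with `fast_gemm` as a hypothetical stand-in for `llamafile_sgemm`:

```cpp
#include <cstdint>

// Hypothetical stand-in for llamafile_sgemm: returns false when the
// optimized kernel cannot handle a given tile.
static bool fast_gemm(int64_t i12, int64_t i13) {
    return (i12 + i13) % 3 != 0; // arbitrary demo condition
}

static void mul_mat_sketch(int64_t ne12, int64_t ne13) {
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            if (!fast_gemm(i12, i13)) {
                goto UseGgmlGemm1; // one tile declined: fall back for all
            }
        }
    }
    return;        // fast path handled every tile
UseGgmlGemm1:;
    // ... the generic vec_dot-based matmul would run here ...
}
```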
@@ -1,3 +1,5 @@
 message(STATUS "Using TINYBLAS")
 
 add_library(ggml-tinyblas
+            ggml-tinyblas.cpp
+            )
@@ -225,6 +227,10 @@ endif()
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17)
+#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17")
+target_compile_features(ggml-tinyblas PRIVATE cxx_std_17)
+
 if (EMSCRIPTEN)
     set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
@@ -1,3 +1,48 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "ggml-tinyblas.h"
@@ -7,8 +52,9 @@
 
 #include <memory>
 #include <cstring>
+#include <iostream>
 
 // TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool
 //       https://github.com/ggerganov/llama.cpp/pull/1999
 #ifdef GGML_USE_OPENMP
 #include <omp.h>
 #endif
@@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas {
         int n_threads = GGML_DEFAULT_N_THREADS;
         std::unique_ptr<char[]> work_data;
         size_t work_size = 0;
-        //int pp_threads = GGML_DEFAULT_N_THREADS;
-        //int tg_threads = GGML_DEFAULT_N_THREADS;
     };
 
     template<bool RUN>
@@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas {
             }
         }
 
-        // apres conversion de B: FP32 => src0->vec_dot_type
+        // after convert B: FP32 => src0->vec_dot_type
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
             if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas {
                            src1->data, nb11/ggml_type_size(src1->type),
                            dst->data, nb1/ggml_type_size(dst->type),
                            0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
-                // @ voir ca aurait etait bien de redimensioner work_data ici..
+                // TODO: how to resize work_data here
                 return true;
             }
         }
@@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas {
         const enum ggml_type type0 = src0->type;
         const enum ggml_type type1 = src1->type;
 
-        // les type "directs"
         // broadcast factors
         const int64_t r2 = ne12 / ne02;
         const int64_t r3 = ne13 / ne03;
@@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas {
         }
 UseGgmlGemm1:;
 
-        // apres conversion de B ?
+        // with B converted from FP32 -> vec_dot_type
         GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
         enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
         ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
-        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
 
         if (src1->type != vec_dot_type) {
-            // OK on va au moins essayer de changer le type de B
 
             const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
-            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
             const size_t nbw2 = nbw1*ne11;
             const size_t nbw3 = nbw2*ne12;
 
-            // TOD0: vor si on peu caller ca dans supports_mul_mat
+            // TODO: move to: supports_mul_mat
             if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
                 ctx->work_data.reset(new char[ne13*nbw3]);
                 ctx->work_size = ne13*nbw3;
@@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             char * wdata = ctx->work_data.get();
 
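The pattern around this hunk is worth spelling out: thread 0 grows the shared scratch buffer while the other threads wait at an OpenMP barrier before touching it. A hedged sketch of that idiom (the `needed` size and `convert_B` wrapper are illustrative, not from the diff):

```cpp
#include <memory>
#include <omp.h>

struct context {
    std::unique_ptr<char[]> work_data;
    size_t work_size = 0;
};

// Sketch: prepare a shared scratch buffer from many threads.
void convert_B(context *ctx, size_t needed) {
    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();
        // Only thread 0 reallocates, so ownership never races.
        if (ith == 0 && ctx->work_size < needed) {
            ctx->work_data.reset(new char[needed]);
            ctx->work_size = needed;
        }
        // Everyone waits until the buffer is valid before writing slices.
        #pragma omp barrier
        char *wdata = ctx->work_data.get();
        (void)wdata; // ... each thread converts its share of rows here ...
    }
}
```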
@@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas {
 #ifdef GGML_USE_OPENMP
 #pragma omp barrier
 #else
-            static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
 #endif
             // mat-mul bis...
             for (int64_t i13 = 0; i13 < ne13; i13++)
@@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas {
             delete backend;
         }
 
-        // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin...
-        //  - voir ggml_graph_compute / ggml_threadpool
-        //    https://github.com/ggerganov/llama.cpp/pull/1999
-        //
         static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
             context * ctx = (context *)backend->context;
 
@@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas {
                     mul_mat(ctx, node, ith, nth);
                 }
 #else
-                static_assert(false, "Note implemented: use GGML_USE_OPENMP");
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
                 mul_mat(ctx, node, 0, 1);
 #endif
                 break;
@@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas {
         return backend != NULL && ggml_guid_matches(backend->guid, guid());
     }
 
-    // number of threads to use for compute
-    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->pp_threads = n_threads;
-    }
-
-    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
-        GGML_ASSERT(is_tinyblas(backend));
-        context * ctx = (context *)backend->context;
-        //ctx->tg_threads = n_threads;
-    }
-
     static void set_n_threads(ggml_backend_t backend, int n_threads) {
         GGML_ASSERT(is_tinyblas(backend));
         context * ctx = (context *)backend->context;
         ctx->n_threads = n_threads;
-        //ctx->tg_threads = n_threads;
-        //ctx->pp_threads = n_threads;
     }
 
 }
@@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device {
     }
 
     static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
-        //const struct ggml_tensor * src0 = op->src[0];
-        //const struct ggml_tensor * src1 = op->src[1];
-
         switch (op->op) {
             case GGML_OP_NONE:
             case GGML_OP_RESHAPE:
@@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg {
         if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
             return (void *)ggml::backend::tinyblas::set_n_threads;
         }
-        if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_pp_threads;
-        }
-        if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) {
-            return (void *)ggml::backend::tinyblas::set_tg_threads;
-        }
         return NULL;
     }
 
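For context, `ggml_backend_set_n_threads` is the optional per-backend entry point that callers look up by name through the registry. A hedged sketch of the caller side, assuming ggml's registry API as of late 2024 (`ggml_backend_reg_get_proc_address` and the `ggml_backend_set_n_threads_t` typedef are not shown in this diff):

```cpp
#include "ggml-backend.h"

// Sketch: configure the thread count of a backend that exposes the
// optional "ggml_backend_set_n_threads" entry point by name.
static void try_set_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_reg_t reg =
        ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
    auto set_n_threads = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads != nullptr) { // the entry point is optional
        set_n_threads(backend, n_threads);
    }
}
```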
@@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
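These `GGML_UNUSED` statements sit after the `return`, so they are never executed; they exist so that every parameter is referenced even on build configurations where the whole `#if` body compiles away, silencing unused-parameter warnings. In ggml the macro is essentially a void-cast, as in this small sketch:

```cpp
// ggml.h defines roughly: #define GGML_UNUSED(x) (void)(x)
#define GGML_UNUSED(x) (void)(x)

static bool gemm_stub(int m, int n, int k) {
#if defined(__AVX2__)
    // a real kernel would run here and use m, n, k
#endif
    return false;
    // Unreachable, but still compiled: marks the parameters as used
    // when the #if branch above is empty.
    GGML_UNUSED(m);
    GGML_UNUSED(n);
    GGML_UNUSED(k);
}
```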
@@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas {
     // TODO
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
@@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas {
     }
 #endif
     return false;
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
@@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas {
 #else
     return false;
 #endif
+    GGML_UNUSED(m);
+    GGML_UNUSED(n);
+    GGML_UNUSED(k);
+    GGML_UNUSED(A);
+    GGML_UNUSED(lda);
+    GGML_UNUSED(B);
+    GGML_UNUSED(ldb);
+    GGML_UNUSED(C);
+    GGML_UNUSED(ldc);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 template bool gemm<true>(int64_t m, int64_t n, int64_t k,
         const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
@@ -1,17 +1,56 @@
-#pragma once
-//#include <cstdint>
-#include "ggml.h"
-#define GGML_COMMON_DECL_C
-//#define GGML_COMMON_DECL_CPP
-#include "ggml-common.h"
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
 
-// appelé que depuis du c++ (le tinyBLAS backend)
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
+#pragma once
+#include "ggml.h"
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 namespace ggml::backend::tinyblas {
 
-    // on est en C++
-    // => on peu avoir autant de fonction que de type.
-    // calcule C = Aᵀ * B
+    // compute: C = Aᵀ * B
     template<bool RUN>
     bool gemm(int64_t m, int64_t n, int64_t k,
               const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
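A note on the `template<bool RUN>` parameter visible here and in the `mul_mat<false>` call earlier in the diff: the same function body doubles as a capability probe (`RUN == false`, nothing is computed) and as the actual kernel (`RUN == true`). A minimal sketch of the idiom, independent of the real tinyBLAS signatures:

```cpp
#include <cstdint>

// Sketch of the probe/run idiom: with RUN == false the function only
// reports whether a kernel exists for the given shapes; with
// RUN == true it also executes it.
template <bool RUN>
bool gemm_sketch(int64_t m, int64_t n, int64_t k,
                 const float *A, int64_t lda,
                 const float *B, int64_t ldb,
                 float *C, int64_t ldc) {
    if (m <= 0 || n <= 0 || k <= 0) {
        return false; // unsupported shape: caller must fall back
    }
    if (RUN) {
        for (int64_t j = 0; j < n; j++) {
            for (int64_t i = 0; i < m; i++) {
                float sum = 0.0f;
                for (int64_t l = 0; l < k; l++) {
                    sum += A[l + i * lda] * B[l + j * ldb]; // C = Aᵀ * B
                }
                C[i + j * ldc] = sum;
            }
        }
    }
    return true;
}

// supports_mul_mat-style check first, then the real computation:
// if (gemm_sketch<false>(m, n, k, A, lda, B, ldb, C, ldc)) {
//     gemm_sketch<true>(m, n, k, A, lda, B, ldb, C, ldc);
// }
```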