From 7dd261f3e965348cb631c625fd9e29dd8c48c1f6 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:58:30 +0100 Subject: [PATCH] extract llamafile in new tinyblas backend --- Makefile | 21 +- ggml/include/ggml-cpu.h | 1 - ggml/include/ggml-tinyblas.h | 17 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-cpu/ggml-cpu.c | 8 - ggml/src/ggml-cpu/ggml-cpu.cpp | 3 - ggml/src/ggml-cpu/llamafile/sgemm.h | 14 - ggml/src/ggml-tinyblas/CMakeLists.txt | 230 +++++++ ggml/src/ggml-tinyblas/ggml-tinyblas.cpp | 472 ++++++++++++++ .../llamafile => ggml-tinyblas}/sgemm.cpp | 600 +++++++++++------- ggml/src/ggml-tinyblas/sgemm.h | 51 ++ src/llama.cpp | 1 - 12 files changed, 1181 insertions(+), 244 deletions(-) create mode 100644 ggml/include/ggml-tinyblas.h delete mode 100644 ggml/src/ggml-cpu/llamafile/sgemm.h create mode 100644 ggml/src/ggml-tinyblas/CMakeLists.txt create mode 100644 ggml/src/ggml-tinyblas/ggml-tinyblas.cpp rename ggml/src/{ggml-cpu/llamafile => ggml-tinyblas}/sgemm.cpp (80%) create mode 100644 ggml/src/ggml-tinyblas/sgemm.h diff --git a/Makefile b/Makefile index 539370e06..fa94c3bf3 100644 --- a/Makefile +++ b/Makefile @@ -568,8 +568,8 @@ ifdef GGML_NVPL endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o + MK_CPPFLAGS += -DGGML_USE_TINYBLAS + OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o endif ifndef GGML_NO_AMX @@ -1153,6 +1153,23 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-impl.h $(CXX) $(CXXFLAGS) -c $< -o $@ +# TODO: renomer en GGML_NO_TINYBLAS +# needed for c++17 build +ifndef GGML_NO_LLAMAFILE +ggml/src/ggml-tinyblas/ggml-tinyblas.o: \ + ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \ + ggml/include/ggml-tinyblas.h \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ + +ggml/src/ggml-tinyblas/sgemm.o: \ + ggml/src/ggml-tinyblas/sgemm.cpp \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ +endif # GGML_NO_LLAMAFILE + # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c $(CC) $(CFLAGS) -MMD -c $< -o $@ diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef979..49a18ba37 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -124,7 +124,6 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); - GGML_BACKEND_API int ggml_cpu_has_llamafile (void); // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/include/ggml-tinyblas.h b/ggml/include/ggml-tinyblas.h new file mode 100644 index 000000000..4c0075327 --- /dev/null +++ b/ggml/include/ggml-tinyblas.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend register +GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d8201..78bcb6c5c 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -27,6 +27,10 @@ #include "ggml-blas.h" #endif +#ifdef GGML_USE_TINYBLAS +#include "ggml-tinyblas.h" +#endif + #ifdef GGML_USE_RPC #include "ggml-rpc.h" #endif @@ -66,6 +70,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_BLAS 
register_backend(ggml_backend_blas_reg()); #endif +#ifdef GGML_USE_TINYBLAS + register_backend(ggml_backend_tinyblas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 61f53cd01..7f5c465df 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13868,14 +13868,6 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_llamafile(void) { -#if defined(GGML_USE_LLAMAFILE) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9..a131f5e28 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -616,9 +616,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } - if (ggml_cpu_has_llamafile()) { - features.push_back({ "LLAMAFILE", "1" }); - } features.push_back({ nullptr, nullptr }); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h deleted file mode 100644 index caf6dd556..000000000 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once -#include -#include -#ifdef __cplusplus -extern "C" { -#endif - -bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, - const void *, int64_t, void *, int64_t, int, int, - int, int, int); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt new file mode 100644 index 000000000..c8c4fd04e --- /dev/null +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -0,0 +1,230 @@ +add_library(ggml-tinyblas + ggml-tinyblas.cpp + ) + +target_link_libraries(ggml-tinyblas PRIVATE ggml-base) +target_include_directories(ggml-tinyblas PRIVATE . ..) 
+ +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + add_compile_definitions(GGML_USE_ACCELERATE) + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + + target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + + add_compile_definitions(GGML_USE_OPENMP) + + target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + + else() + message(WARNING "OpenMP not found") + endif() +endif() + +target_sources(ggml-tinyblas PRIVATE + sgemm.cpp + sgemm.h) + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + + message(STATUS "ARM detected") + + if (MSVC) + add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead + add_compile_definitions(__ARM_NEON) + add_compile_definitions(__ARM_FEATURE_FMA) + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + add_compile_definitions(__ARM_FEATURE_DOTPROD) + endif () + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + endif() + endif() +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + message(STATUS "x86 detected") + if (MSVC) + # instruction set 
detection for MSVC only + if (GGML_NATIVE) + # TODO: improve, should not reference files from the parent folder + include(../ggml-cpu/cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + add_compile_definitions($<$:__AVX512VBMI__>) + add_compile_definitions($<$:__AVX512VBMI__>) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions($<$:__AVX512VNNI__>) + add_compile_definitions($<$:__AVX512VNNI__>) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + add_compile_definitions($<$:__AVX512BF16__>) + add_compile_definitions($<$:__AVX512BF16__>) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + add_compile_definitions($<$:__AMX_TILE__>) + add_compile_definitions($<$:__AMX_TILE__>) + endif() + if (GGML_AMX_INT8) + add_compile_definitions($<$:__AMX_INT8__>) + add_compile_definitions($<$:__AMX_INT8__>) + endif() + if (GGML_AMX_BF16) + add_compile_definitions($<$:__AMX_BF16__>) + add_compile_definitions($<$:__AMX_BF16__>) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + endif() + else() + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + endif() + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + endif() + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) + endif() + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() +else() + message(STATUS "Unknown architecture") +endif() + +target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") 
+target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") + +if (EMSCRIPTEN) + set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128") +endif() diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp new file mode 100644 index 000000000..7317b5dd3 --- /dev/null +++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp @@ -0,0 +1,472 @@ +#include "ggml-cpu.h" +#include "ggml-impl.h" +#include "ggml-tinyblas.h" +#include "ggml-backend-impl.h" + +#include "sgemm.h" + +#include +#include +#include + +#ifdef GGML_USE_OPENMP +#include +#endif + +namespace ggml::backend::tinyblas { + + static const char* NAME = "tinyBLAS"; + + struct context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr work_data; + size_t work_size = 0; + //int pp_threads = GGML_DEFAULT_N_THREADS; + //int tg_threads = GGML_DEFAULT_N_THREADS; + }; + + template + static bool mul_mat(int64_t m, int64_t n, int64_t k, + const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc, + int ith, int nth, + const enum ggml_type Atype, const enum ggml_type Btype, const enum ggml_type Ctype) + { + GGML_ASSERT(Ctype == GGML_TYPE_F32); + switch (Atype) { + case GGML_TYPE_F32: + if (Btype != GGML_TYPE_F32) return false; + return gemm(m, n, k, (const float*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth); + break; + case GGML_TYPE_F16: + switch (Btype) { + case GGML_TYPE_F32: + return gemm(m, n, k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth); + case GGML_TYPE_F16: + return gemm(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, (float*)C, ldc, ith, nth); + default: + return false; + } + break; + case GGML_TYPE_BF16: + switch (Btype) { + case GGML_TYPE_F32: + return gemm(m, n, k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth); + case GGML_TYPE_BF16: + return gemm(m, n, k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, (float*)C, ldc, ith, nth); + default: + return false; + } + break; + case GGML_TYPE_Q8_0: + if (Btype != GGML_TYPE_Q8_0) return false; + return gemm(m, n, k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth); + break; + case GGML_TYPE_Q4_0: + if (Btype != GGML_TYPE_Q8_0) return false; + return gemm(m, n, k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth); + break; + case GGML_TYPE_Q5_0: + if (Btype != GGML_TYPE_Q8_0) return false; + return gemm(m, n, k, (const block_q5_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth); + break; + case GGML_TYPE_IQ4_NL: + if (Btype != GGML_TYPE_Q8_0) return false; + return gemm(m, n, k, (const block_iq4_nl*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth); + break; + default: + return false; + } + return false; + } + + static bool supports_mul_mat(ggml_backend_dev_t, const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + if (dst->type != GGML_TYPE_F32) return false; + + if (ne0 != ne01) return false; + if (ne1 != ne11) return false; + if (ne2 != ne12) return false; + if (ne3 != ne13) return false; + + // we don't support permuted src0 or src1 + if (nb00 != ggml_type_size(src0->type)) return false; + if (nb10 != ggml_type_size(src1->type)) return false; + + // dst cannot be transposed or permuted + if (nb0 != sizeof(float)) return false; + if (nb0 > nb1) return false; + if (nb1 > nb2) return false; + if (nb2 > nb3) return 
false;
+
+        if (ggml_is_contiguous(src1)) {
+            if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, src1->type, GGML_TYPE_F32)) {
+                return true;
+            }
+        }
+
+        // after converting B: FP32 => src0->vec_dot_type
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
+            if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
+                // TODO: it would have been nice to resize work_data here..
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static void mul_mat(ggml::backend::tinyblas::context * ctx, struct ggml_tensor * dst, const int ith, const int nth) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const enum ggml_type type0 = src0->type;
+        const enum ggml_type type1 = src1->type;
+
+        // the "direct" types
+        // broadcast factors
+        const int64_t r2 = ne12 / ne02;
+        const int64_t r3 = ne13 / ne03;
+
+        if (ggml_is_contiguous(src1)) {
+            for (int64_t i13 = 0; i13 < ne13; i13++) {
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)src1->data + i12*nb12 + i13*nb13;
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                 data0, nb01/ggml_type_size(src0->type),
+                                 data1, nb11/ggml_type_size(src1->type),
+                                 data, nb1/ggml_type_size(dst->type),
+                                 ith, nth, type0, type1, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm1;
+                    }
+                }
+            }
+            return;
+        }
+        UseGgmlGemm1:;
+
+        // after converting B?
+        GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float'
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
+        ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
+        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
+
+        if (src1->type != vec_dot_type) {
+            // OK, at least try to convert the type of B
+
+            const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+            const size_t nbw2 = nbw1*ne11;
+            const size_t nbw3 = nbw2*ne12;
+
+            // TODO: see if this can be done in supports_mul_mat
+            if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
+                ctx->work_data.reset(new char[ne13*nbw3]);
+                ctx->work_size = ne13*nbw3;
+            }
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            char * wdata = ctx->work_data.get();
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                        from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                   (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                   ne10);
+                    }
+                }
+            }
+
+            // synchronize all threads!
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            // mat-mul, second pass...
+            for (int64_t i13 = 0; i13 < ne13; i13++)
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)wdata + i12*nbw2 + i13*nbw3;
+
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                 data0, nb01/ggml_type_size(src0->type),
+                                 data1, nbw1/ggml_type_size(vec_dot_type),
+                                 data, nb1/ggml_type_size(dst->type),
+                                 ith, nth, type0, vec_dot_type, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm2;
+                    }
+                }
+            return;
+        }
+        UseGgmlGemm2:;
+    }
+
+    static const char * get_name(ggml_backend_t /*backend*/) {
+        return NAME;
+    }
+
+    static void free(ggml_backend_t backend) {
+        context * ctx = (context *)backend->context;
+        delete ctx;
+        delete backend;
+    }
+
+    // TODO: decide how to manage threads / thread pools for all the backends that need them...
+    // - see ggml_graph_compute / ggml_threadpool
+    // https://github.com/ggerganov/llama.cpp/pull/1999
+    //
+    static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+        context * ctx = (context *)backend->context;
+
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+
+            switch (node->op) {
+                case GGML_OP_MUL_MAT:
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel num_threads(ctx->n_threads)
+                    {
+                        int ith = omp_get_thread_num();
+                        int nth = ctx->n_threads;
+                        mul_mat(ctx, node, ith, nth);
+                    }
+#else
+                    static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+                    mul_mat(ctx, node, 0, 1);
+#endif
+                    break;
+                case GGML_OP_NONE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_PERMUTE:
+                case GGML_OP_TRANSPOSE:
+                    break;
+
+                default:
+                    GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+            }
+        }
+
+        return GGML_STATUS_SUCCESS;
+    }
+
+    static struct ggml_backend_i interface = {
+        /* .get_name = */ get_name,
+        /* .free = */ free,
+        /* .set_tensor_async = */ NULL,
+        /* .get_tensor_async = */ NULL,
+        /* .cpy_tensor_async = */ NULL,
+        /* .synchronize = */ NULL,
+        /* .graph_plan_create = */ NULL,
+        /* .graph_plan_free = */ NULL,
+        /* .graph_plan_update = */ NULL,
+        /* .graph_plan_compute = */ NULL,
+        /* .graph_compute = */ graph_compute,
+        /* .event_record = */ NULL,
+        /* .event_wait = */ NULL,
+    };
+
+    static ggml_guid_t guid(void) {
+        static ggml_guid guid = { 0x23, 0xf5, 0x9f, 0xa2, 0xb1, 0x48, 0x39, 0x25, 0x83, 0xcd, 0x79, 0x16, 0xb7, 0x23, 0x94, 0xde };
+        return &guid;
+    }
+
+    static ggml_backend_t init(void) {
+        context * ctx = new context;
+
+        ggml_backend_t backend = new ggml_backend {
+            /* .guid = */ guid(),
+            /* .interface = */ interface,
+            /* .device = */ ggml_backend_reg_dev_get(ggml_backend_tinyblas_reg(), 0),
+            /* .context = */ ctx,
+        };
+
+        return backend;
+    }
+
+    static bool is_tinyblas(ggml_backend_t backend) {
+        return backend != NULL && ggml_guid_matches(backend->guid, guid());
+    }
+
+    // number of threads to use for compute
+    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->pp_threads = n_threads;
+    }
+
+    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->tg_threads = n_threads;
+    }
+
+    static void set_n_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        ctx->n_threads = n_threads;
+        //ctx->tg_threads 
= n_threads; + //ctx->pp_threads = n_threads; + } + +} + +// device interface +namespace ggml::backend::tinyblas::device { + static const char * get_name(ggml_backend_dev_t) { + return "BLAS"; + } + + static const char * get_description(ggml_backend_dev_t) { + return "tinyBLAS"; + } + + static void get_memory(ggml_backend_dev_t, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + } + + static enum ggml_backend_dev_type get_type(ggml_backend_dev_t) { + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + } + + static void get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = get_name(dev); + props->description = get_description(dev); + props->type = get_type(dev); + get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; + } + + static ggml_backend_t init_backend(ggml_backend_dev_t, const char *) { + return ggml::backend::tinyblas::init(); + } + + static ggml_backend_buffer_type_t get_buffer_type(ggml_backend_dev_t) { + return ggml_backend_cpu_buffer_type(); + } + + static ggml_backend_buffer_t buffer_from_host_ptr(ggml_backend_dev_t, void * ptr, size_t size, size_t) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + } + + static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + //const struct ggml_tensor * src0 = op->src[0]; + //const struct ggml_tensor * src1 = op->src[1]; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + case GGML_OP_MUL_MAT: + return supports_mul_mat(device, op); + default: + return false; + } + } + + static bool supports_buft(ggml_backend_dev_t, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + } + + static const struct ggml_backend_device_i interface = { + /* .get_name = */ get_name, + /* .get_description = */ get_description, + /* .get_memory = */ get_memory, + /* .get_type = */ get_type, + /* .get_props = */ get_props, + /* .init_backend = */ init_backend, + /* .get_buffer_type = */ get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ buffer_from_host_ptr, + /* .supports_op = */ supports_op, + /* .supports_buft = */ supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, + }; + +} + +// backend reg interface +namespace ggml::backend::tinyblas::reg { + static const char * get_name(ggml_backend_reg_t) { + return ggml::backend::tinyblas::NAME; + } + + static size_t get_device_count(ggml_backend_reg_t) { + return 1; + } + + static ggml_backend_dev_t get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device device = { + /* .iface = */ ggml::backend::tinyblas::device::interface, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &device; + } + + static void * get_proc_address(ggml_backend_reg_t, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_n_threads; + } + if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_pp_threads; + } + if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_tg_threads; + } + return NULL; + } + + static const struct ggml_backend_reg_i 
interface = { + /* .get_name = */ get_name, + /* .get_device_count = */ get_device_count, + /* .get_device = */ get_device, + /* .get_proc_address = */ get_proc_address, + }; + +} + +ggml_backend_reg_t ggml_backend_tinyblas_reg(void) { + static struct ggml_backend_reg backend_reg = { + /* .iface = */ ggml::backend::tinyblas::reg::interface, + /* .context = */ NULL, + }; + return &backend_reg; +} diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp similarity index 80% rename from ggml/src/ggml-cpu/llamafile/sgemm.cpp rename to ggml/src/ggml-tinyblas/sgemm.cpp index b2ce2e664..5c7a3c357 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -50,8 +50,6 @@ #include "sgemm.h" #include "ggml-impl.h" -// hack until moved into the CPU backend -#include "../ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER @@ -135,6 +133,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) { return _mm512_fmadd_ps(a, b, c); } #endif +#if defined(__AVX512BF16__) +template <> +inline __m512 madd(__m512bh a, __m512bh b, __m512 c) { + return _mm512_dpbf16_ps(c, a, b); +} +template <> +inline __m256 madd(__m256bh a, __m256bh b, __m256 c) { + return _mm256_dpbf16_ps(c, a, b); +} +#endif #endif #if defined(__ARM_FEATURE_FMA) @@ -226,6 +234,13 @@ template <> inline __m256 load(const float *p) { } #endif // __AVX__ +#if defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m256 load(const ggml_bf16_t *p) { + return _mm256_castsi256_ps( + _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16)); +} +#endif // __AVX2__ + #if defined(__F16C__) template <> inline __m256 load(const ggml_fp16_t *p) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); @@ -239,8 +254,27 @@ template <> inline __m512 load(const float *p) { template <> inline __m512 load(const ggml_fp16_t *p) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); } +template <> inline __m512 load(const ggml_bf16_t *p) { + return _mm512_castsi512_ps( + _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16)); +} #endif // __AVX512F__ +#if defined(__AVX512BF16__) +template <> inline __m512bh load(const ggml_bf16_t *p) { + return (__m512bh)_mm512_loadu_ps((const float *)p); +} +template <> inline __m256bh load(const ggml_bf16_t *p) { + return (__m256bh)_mm256_loadu_ps((const float *)p); +} +template <> inline __m512bh load(const float *p) { + return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p)); +} +template <> inline __m256bh load(const float *p) { + return _mm512_cvtneps_pbh(_mm512_loadu_ps(p)); +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // CONSTANTS @@ -1627,259 +1661,395 @@ class tinyBLAS_PPC { #endif } // namespace -/** - * Performs optimized matrix multiplication on CPU. - * - * This subroutine may compute C = Aᵀ * B with column major ordering. - * Despite its name, this isn't a generalized implementation. Work is - * only performed when a handwritten kernel is written and available. - * Otherwise the caller should fall back to a general matmul routine. 
- * - * For example, for single-threaded single-precision GEMM you can say - * - * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, - * 0, 1, - * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); - * - * @param m is rows in `A` and `C` - * @param n is cols in `B` and `C` - * @param k is cols in `A` and rows in `B` - * @param A is first input matrix (always transposed) - * @param lda is row stride of `A` - * @param B is second input matrix (never transposed) - * @param ldb is row stride of `B` - * @param C is input/output array of output matrices - * @param ldc is row stride of `C` - * @param ith is thread id (must be less than `nth`) - * @param nth is number of threads (must be greater than zero) - * @param Atype is GGML data type of `A` - * @param Btype is GGML data type of `B` - * @param Ctype is GGML data type of `C` - * @return true if this function was able to service the matmul request - */ -bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { +namespace ggml::backend::tinyblas { - assert(m >= 0); - assert(n >= 0); - assert(k >= 0); - assert(lda >= k); - assert(ldb >= k); - assert(ldc >= m); - assert(nth > 0); - assert(ith < nth); + /** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. + * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @return true if this function was able to service the matmul request + */ - // only enable sgemm for prompt processing - if (n < 2) - return false; - - if (Ctype != GGML_TYPE_F32) - return false; - - switch (Atype) { - - case GGML_TYPE_F32: { - if (Btype != GGML_TYPE_F32) - return false; + template + bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - tinyBLAS<16, __m512, __m512, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__AVX__) || defined(__AVX2__) - if (k % 8) - return false; - tinyBLAS<8, __m256, __m256, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) - if (n < 4) - return false; - if (k % 4) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ - k, (const float *)A, lda, - (const 
float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__MMA__) - if (k % 8) - return false; - tinyBLAS_PPC tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else - return false; + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #endif +#if defined(__AVX__) || defined(__AVX2__) + if ((k % 8)==0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + // TODO: voir a mettre ca dans un autre fichier... +#if defined(__MMA__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS_PPC tb{ k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; } + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); - case GGML_TYPE_F16: { + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) && !defined(_MSC_VER) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512F__) + if ((k % 16) == 
0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + // wait for convert B => bf16? + //if ((k % 32) == 0) { + // if constexpr (RUN) { + // tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + // tb.matmul(m, n); + // } + // return true; + //} +#elif defined(__AVX512F__) + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) - if (k % 8) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) - if (n < 8) - return false; - if (k % 8) - return false; - if (Btype != GGML_TYPE_F16) - return false; - tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const ggml_fp16_t *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (k % 4) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else - return false; + // TODO #endif + return false; } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + 
assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + if ((k % 32) == 0) { + if constexpr (RUN) { + tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } + // 2eme chance... + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m256, __m256bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q8_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q4_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const 
block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q5_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q5_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); - case GGML_TYPE_IQ4_NL: { - if (Btype != GGML_TYPE_Q8_0) - return false; + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_iq4_nl *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } - - default: - return false; - } - - (void)m; - (void)n; - (void)k; - (void)A; - (void)lda; - (void)B; - (void)ldb; - (void)C; - (void)ldc; - (void)ith; - (void)nth; - (void)Atype; - (void)Btype; - (void)Ctype; + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); } diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h new file mode 100644 index 000000000..88c014e3e --- /dev/null +++ b/ggml/src/ggml-tinyblas/sgemm.h @@ -0,0 +1,51 @@ +#pragma once +//#include +#include "ggml.h" +#define GGML_COMMON_DECL_C +//#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" + +// appelé que depuis du c++ (le tinyBLAS backend) + +namespace ggml::backend::tinyblas { + + // on est en C++ + // => on peu avoir autant de fonction que de type. 
+ // calcule C = Aᵀ * B + template + bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith=0, int nth=1); +} diff --git a/src/llama.cpp b/src/llama.cpp index 1703104fb..d6c7cd08e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -22034,7 +22034,6 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; return s.c_str(); }
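
Note on the new sgemm entry points: the templated gemm() overloads declared in ggml-tinyblas/sgemm.h replace the old llamafile_sgemm() C call. What follows is a minimal, hypothetical standalone sketch of the intended probe-then-run pattern (the same pattern supports_mul_mat/mul_mat use in ggml-tinyblas.cpp), and is not part of the diff above. It assumes the bool template parameter is the RUN flag guarded by "if constexpr (RUN)" in sgemm.cpp (the angle-bracket template syntax was stripped from this rendering of the patch), and that the ggml include directories and the ggml-tinyblas sources are available and built with -std=c++17 and a suitable -march.

// Hypothetical usage sketch (not from the patch): probe with RUN=false, execute with RUN=true.
#include <cstdint>
#include <cstdio>
#include <vector>

#include "sgemm.h"   // ggml/src/ggml-tinyblas/sgemm.h (pulls in ggml.h / ggml-common.h)

int main() {
    const int64_t m = 64, n = 32, k = 128;   // k is a multiple of 16/8/4, so any SIMD path qualifies
    std::vector<float> A(m * k, 1.0f);       // A is stored transposed: lda >= k
    std::vector<float> B(n * k, 2.0f);       // ldb >= k
    std::vector<float> C(m * n, 0.0f);       // column-major output: ldc >= m

    namespace tb = ggml::backend::tinyblas;

    // RUN = false: only asks whether a hand-written kernel exists for this case; no work is done.
    if (!tb::gemm<false>(m, n, k, A.data(), k, B.data(), k, C.data(), m)) {
        std::puts("tinyBLAS cannot service this matmul, fall back to ggml");
        return 0;
    }

    // RUN = true (the default): computes C = A^T * B on a single thread (ith = 0, nth = 1).
    tb::gemm<true>(m, n, k, A.data(), k, B.data(), k, C.data(), m, 0, 1);
    std::printf("C[0][0] = %g\n", C[0]);     // expect 128 * 1.0 * 2.0 = 256
    return 0;
}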
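
Once GGML_USE_TINYBLAS is defined, ggml-backend-reg.cpp registers the backend at startup and it becomes visible through the regular ggml-backend registry API. A second hedged sketch, again not part of the diff: it obtains the registration directly from the patch's ggml_backend_tinyblas_reg() entry point, instantiates the single device, and adjusts the thread count through the "ggml_backend_set_n_threads" proc address exposed by get_proc_address(). The function-pointer typedef below is local to this example, and an OpenMP build is assumed, since the non-OpenMP path in graph_compute() is not implemented.

// Hypothetical discovery/usage sketch for the registered backend (not from the patch).
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-tinyblas.h"

typedef void (*set_n_threads_fn)(ggml_backend_t backend, int n_threads);

int main() {
    ggml_backend_reg_t reg = ggml_backend_tinyblas_reg();
    std::printf("reg: %s, devices: %zu\n", ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));

    ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
    ggml_backend_t     backend = ggml_backend_dev_init(dev, /*params =*/ nullptr);

    // graph_compute() parallelizes GGML_OP_MUL_MAT over context::n_threads OpenMP threads;
    // this proc address is how callers adjust that count.
    auto set_n_threads = (set_n_threads_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads != NULL) {
        set_n_threads(backend, 8);
    }

    // ... build a ggml_cgraph whose MUL_MAT nodes pass supports_op(), then:
    // ggml_backend_graph_compute(backend, graph);

    ggml_backend_free(backend);
    return 0;
}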