add amx kernel for gemm

add intel amx isa detection

add vnni kernel for gemv cases

add vnni and amx kernel support for block_q8_0

code cleanup

fix packing B issue

enable openmp

fine tune amx kernel

switch to aten parallel pattern

add error message for nested parallelism

code cleanup

add f16 support in ggml-amx

add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS
Author: mingfeima
Date:   2024-04-06 19:57:25 -07:00
Parent: 15fa07a5c5
Commit: 3ff0c0e16f

6 changed files with 2636 additions and 5 deletions
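Several of the squashed messages above name specific ISA kernels. As background for the "add vnni kernel for gemv cases" item, here is a minimal sketch of the AVX512-VNNI primitive such a GEMV kernel builds on (illustrative only; the helper name is not from this commit, and it assumes compilation with -mavx512vnni):

    #include <immintrin.h>
    #include <stdint.h>

    // One AVX512-VNNI step: each of the 16 int32 lanes of acc gains the
    // sum of 4 adjacent u8*s8 products, i.e. a 64-byte chunk of a dot product.
    static inline __m512i vnni_dot_step(__m512i acc, const uint8_t * a, const int8_t * b) {
        __m512i va = _mm512_loadu_si512((const void *) a);
        __m512i vb = _mm512_loadu_si512((const void *) b);
        return _mm512_dpbusd_epi32(acc, va, vb);
    }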

Makefile

@@ -90,11 +90,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1
@@ -348,6 +343,12 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 MK_LDFLAGS += -fsanitize=undefined -g
 endif
 
+ifdef LLAMA_OPENMP
+MK_CPPFLAGS += -fopenmp
+MK_CFLAGS += -fopenmp
+MK_CXXFLAGS += -fopenmp
+endif
+
 ifdef LLAMA_SERVER_VERBOSE
 MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
@@ -576,6 +577,11 @@ ifndef GGML_NO_LLAMAFILE
 OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+MK_CPPFLAGS += -DGGML_USE_AMX
+OBJ_GGML += ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
 OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1065,6 +1071,14 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \
@@ -1210,6 +1224,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
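Given the Makefile changes above, the AMX object is built by default on x86 and can be opted out with GGML_NO_AMX; a sketch of typical invocations (assuming GNU make):

    # OpenMP is required for the AMX path (see the guard added in ggml.c below)
    make LLAMA_OPENMP=1

    # build without the AMX kernels
    make GGML_NO_AMX=1 LLAMA_OPENMP=1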

ggml/include/ggml.h

@@ -2387,6 +2387,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
     GGML_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_API int ggml_cpu_has_amx_int8 (void);
     GGML_API int ggml_cpu_has_fma      (void);
     GGML_API int ggml_cpu_has_neon     (void);
     GGML_API int ggml_cpu_has_sve      (void);

ggml/src/ggml-amx/mmq.cpp (new file, 2564 lines)

File diff suppressed because it is too large.
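Since the kernel file itself is suppressed, here is a hedged sketch of the AMX-INT8 tile pattern a GEMM kernel like mmq.cpp is built around (illustrative, not the commit's actual code; assumes -mamx-tile -mamx-int8, B pre-packed into the VNNI tile layout, and that ggml_amx_init() has already obtained OS permission):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    // One 16x16 int32 output tile: tmm0 = C accumulator,
    // tmm1 = 16x64 int8 slice of A, tmm2 = 16x64 int8 pre-packed slice of B.
    static void amx_gemm_tile_16x16(const int8_t * A, const int8_t * B_packed,
                                    int32_t * C, int64_t K, int64_t lda) {
        _Alignas(64) struct {
            uint8_t  palette_id;
            uint8_t  start_row;
            uint8_t  reserved[14];
            uint16_t colsb[16];  // bytes per tile row
            uint8_t  rows[16];   // rows per tile
        } cfg;
        memset(&cfg, 0, sizeof(cfg));
        cfg.palette_id = 1;
        cfg.rows[0] = 16; cfg.colsb[0] = 64;  // C: 16 x 16 int32
        cfg.rows[1] = 16; cfg.colsb[1] = 64;  // A: 16 x 64 int8
        cfg.rows[2] = 16; cfg.colsb[2] = 64;  // B: 16 rows x 64 bytes, VNNI layout
        _tile_loadconfig(&cfg);

        _tile_zero(0);
        for (int64_t k = 0; k < K; k += 64) {
            _tile_loadd(1, A + k, lda);             // row stride of A in bytes
            _tile_loadd(2, B_packed + k * 16, 64);  // consecutive packed B tiles
            _tile_dpbssd(0, 1, 2);                  // C += A (s8) * B (s8)
        }
        _tile_stored(0, C, 16 * sizeof(int32_t));
        _tile_release();
    }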

ggml/src/ggml-amx/mmq.h (new file, 17 lines)

@@ -0,0 +1,17 @@
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool ggml_amx_init(void);
+
+bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor * dst);
+
+void ggml_mul_mat_amx(struct ggml_tensor * dst, int nth, int ith, void * wdata, int wsize);
+
+#ifdef __cplusplus
+}
+#endif
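mmq.h exposes ggml_amx_init() because the AMX tile registers are an opt-in CPU state: on Linux a process must request permission for XTILEDATA before executing tile instructions. A minimal sketch of what such an init typically does (an assumption about the implementation, using the kernel uapi arch_prctl XSTATE-permission constants):

    #include <stdbool.h>
    #if defined(__linux__) && defined(__x86_64__)
    #include <sys/syscall.h>
    #include <unistd.h>

    #define ARCH_GET_XCOMP_PERM 0x1022
    #define ARCH_REQ_XCOMP_PERM 0x1023
    #define XFEATURE_XTILEDATA  18

    bool ggml_amx_init(void) {
        // ask the kernel for permission to use the AMX tile data state
        if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) != 0) {
            return false;
        }
        // verify that the permission actually took effect
        unsigned long bitmask = 0;
        if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask) != 0) {
            return false;
        }
        return (bitmask & (1UL << XFEATURE_XTILEDATA)) != 0;
    }
    #endif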

ggml/src/ggml.c

@@ -44,10 +44,19 @@ int ggml_sve_cnt_b = 0;
 #undef GGML_USE_LLAMAFILE
 #endif
 
+// enable AMX only with OPENMP
+#if !defined(__AMX_INT8__) || !defined(GGML_USE_OPENMP)
+#undef GGML_USE_AMX
+#endif
+
 #ifdef GGML_USE_LLAMAFILE
 #include <llamafile/sgemm.h>
 #endif
 
+#ifdef GGML_USE_AMX
+#include <ggml-amx/mmq.h>
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -399,6 +408,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+#if GGML_USE_AMX
+// global flag for amx init
+static bool ggml_amx_initialized = false;
+#endif
+
 GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
@@ -3513,6 +3527,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
+#if GGML_USE_AMX
+        ggml_amx_initialized = ggml_amx_init();
+#endif
+
     is_first_call = false;
 }
@@ -12311,6 +12329,13 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
+#if GGML_USE_AMX
+    if (ggml_compute_forward_mul_mat_use_amx(dst) && ggml_amx_initialized) {
+        ggml_mul_mat_amx(dst, nth, ith, params->wdata, params->wsize);
+        return;
+    }
+#endif
+
 #if GGML_USE_LLAMAFILE
     // broadcast factors
     const int64_t r2 = ne12 / ne02;
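ggml_mul_mat_amx receives the thread count nth and thread index ith and splits the work itself; the squashed message "switch to aten parallel pattern" refers to the ATen-style balanced partition. A sketch of that kind of split (illustrative helper, not the commit's code):

    #include <stdint.h>

    // ATen-style balance211: give each of nth threads a contiguous,
    // near-equal slice of n work items (e.g. dst rows).
    static inline void balance211(int64_t n, int64_t nth, int64_t ith,
                                  int64_t * begin, int64_t * end) {
        int64_t chunk = (n + nth - 1) / nth;  // ceil-divide
        *begin = ith * chunk < n ? ith * chunk : n;
        *end   = *begin + chunk < n ? *begin + chunk : n;
    }
    // each thread then processes rows [*begin, *end) independently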
@@ -21953,6 +21978,14 @@ int ggml_cpu_has_avx512_bf16(void) {
 #endif
 }
 
+int ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
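Note that ggml_cpu_has_amx_int8() reports compile-time support via __AMX_INT8__, not what the running CPU offers. A runtime check would query CPUID leaf 7 instead; a sketch of that alternative (not part of this commit):

    #include <cpuid.h>

    // runtime check: CPUID.(EAX=7,ECX=0):EDX bit 25 = AMX-INT8 (bit 24 = AMX-TILE)
    static int cpu_has_amx_int8_runtime(void) {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            return 0;  // leaf 7 not supported
        }
        return (edx >> 25) & 1;
    }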

src/llama.cpp

@@ -19077,6 +19077,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
+    s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
     s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";