Merge 82755ed08a
into 3c7989fd29
This commit is contained in:
commit
721e2b1d8b
10 changed files with 2656 additions and 6 deletions
|
@ -85,6 +85,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
|
||||||
set(GGML_LLAMAFILE ON)
|
set(GGML_LLAMAFILE ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED GGML_AMX)
|
||||||
|
set(GGML_AMX ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
|
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
|
||||||
set(GGML_CUDA_USE_GRAPHS ON)
|
set(GGML_CUDA_USE_GRAPHS ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
19
Makefile
19
Makefile
|
@ -93,11 +93,6 @@ GGML_METAL := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_OPENMP
|
|
||||||
GGML_OPENMP := 1
|
|
||||||
DEPRECATE_WARNING := 1
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifdef LLAMA_RPC
|
ifdef LLAMA_RPC
|
||||||
GGML_RPC := 1
|
GGML_RPC := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
|
@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
|
||||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef GGML_NO_AMX
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_AMX
|
||||||
|
OBJ_GGML += ggml/src/ggml-amx/mmq.o
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef GGML_RPC
|
ifdef GGML_RPC
|
||||||
MK_CPPFLAGS += -DGGML_USE_RPC
|
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||||
OBJ_GGML += ggml/src/ggml-rpc.o
|
OBJ_GGML += ggml/src/ggml-rpc.o
|
||||||
|
@ -1077,6 +1077,14 @@ ggml/src/llamafile/sgemm.o: \
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
endif # GGML_NO_LLAMAFILE
|
endif # GGML_NO_LLAMAFILE
|
||||||
|
|
||||||
|
ifndef GGML_NO_AMX
|
||||||
|
ggml/src/ggml-amx/mmq.o: \
|
||||||
|
ggml/src/ggml-amx/mmq.cpp \
|
||||||
|
ggml/src/ggml-amx/mmq.h \
|
||||||
|
ggml/include/ggml.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef GGML_RPC
|
ifdef GGML_RPC
|
||||||
ggml/src/ggml-rpc.o: \
|
ggml/src/ggml-rpc.o: \
|
||||||
ggml/src/ggml-rpc.cpp \
|
ggml/src/ggml-rpc.cpp \
|
||||||
|
@ -1223,6 +1231,7 @@ clean:
|
||||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||||
rm -vrf ggml/src/ggml-cuda/*.o
|
rm -vrf ggml/src/ggml-cuda/*.o
|
||||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||||
|
rm -vrf ggml/src/ggml-amx/*.o
|
||||||
rm -rvf $(BUILD_TARGETS)
|
rm -rvf $(BUILD_TARGETS)
|
||||||
rm -rvf $(TEST_TARGETS)
|
rm -rvf $(TEST_TARGETS)
|
||||||
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
||||||
|
|
|
@ -28,7 +28,7 @@ variety of hardware - locally and in the cloud.
|
||||||
|
|
||||||
- Plain C/C++ implementation without any dependencies
|
- Plain C/C++ implementation without any dependencies
|
||||||
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
|
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
|
||||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
- AVX, AVX2, AVX512 and AMX support for x86 architectures
|
||||||
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
||||||
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
|
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
|
||||||
- Vulkan and SYCL backend support
|
- Vulkan and SYCL backend support
|
||||||
|
|
|
@ -149,6 +149,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||||
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
|
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
|
||||||
option(GGML_OPENMP "ggml: use OpenMP" ON)
|
option(GGML_OPENMP "ggml: use OpenMP" ON)
|
||||||
option(GGML_RPC "ggml: use RPC" OFF)
|
option(GGML_RPC "ggml: use RPC" OFF)
|
||||||
|
option(GGML_AMX "ggml: use AMX" OFF)
|
||||||
option(GGML_SYCL "ggml: use SYCL" OFF)
|
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||||
|
|
|
@ -2455,6 +2455,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
|
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
GGML_API int ggml_cpu_has_fma (void);
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
GGML_API int ggml_cpu_has_neon (void);
|
||||||
GGML_API int ggml_cpu_has_sve (void);
|
GGML_API int ggml_cpu_has_sve (void);
|
||||||
|
|
|
@ -265,6 +265,14 @@ if (GGML_LLAMAFILE)
|
||||||
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
|
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (GGML_AMX)
|
||||||
|
message(STATUS "Using AMX")
|
||||||
|
|
||||||
|
add_compile_definitions(GGML_USE_AMX)
|
||||||
|
set(GGML_HEADERS_AMX ggml-amx/mmq.h)
|
||||||
|
set(GGML_SOURCES_AMX ggml-amx/mmq.cpp)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (GGML_CUDA)
|
if (GGML_CUDA)
|
||||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||||
|
|
||||||
|
@ -1324,6 +1332,7 @@ add_library(ggml
|
||||||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||||
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
||||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||||
|
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
|
||||||
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
|
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
|
||||||
ggml-aarch64.c ggml-aarch64.h
|
ggml-aarch64.c ggml-aarch64.h
|
||||||
)
|
)
|
||||||
|
|
2575
ggml/src/ggml-amx/mmq.cpp
Normal file
2575
ggml/src/ggml-amx/mmq.cpp
Normal file
File diff suppressed because it is too large
Load diff
17
ggml/src/ggml-amx/mmq.h
Normal file
17
ggml/src/ggml-amx/mmq.h
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
#pragma once

// Public entry points of the AMX matrix-multiplication path.
//
// <stdbool.h> is included explicitly because this header is consumed from C
// translation units as well as C++ ones, and the declarations below use `bool`.
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Forward declaration so the prototypes below refer to the file-scope
// `struct ggml_tensor` even when this header is included before the header
// that defines it (in C, a struct first named inside a parameter list would
// otherwise be scoped to that prototype only).
struct ggml_tensor;

// One-time setup for the AMX path.
// Returns true on success; callers are expected to fall back to the generic
// mul_mat implementation when this returns false.
bool ggml_amx_init(void);

// Predicate: returns true when the mul_mat operation described by `dst` should
// be handled by ggml_mul_mat_amx().
// NOTE(review): the exact eligibility criteria live in mmq.cpp — confirm there.
bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor * dst);

// Runs the AMX mul_mat for `dst`.
//   dst   - destination tensor of the mul_mat operation
//   nth   - presumably the number of worker threads — confirm against caller
//   ith   - presumably the index of the calling thread in [0, nth) — confirm
//   wdata - scratch work buffer supplied by the caller
//   wsize - size of `wdata` (presumably in bytes — confirm)
void ggml_mul_mat_amx(struct ggml_tensor * dst, int nth, int ith, void * wdata, int wsize);

#ifdef __cplusplus
}
#endif
|
|
@ -44,10 +44,19 @@ int ggml_sve_cnt_b = 0;
|
||||||
#undef GGML_USE_LLAMAFILE
|
#undef GGML_USE_LLAMAFILE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// enable AMX only with OPENMP
|
||||||
|
#if !defined(__AMX_INT8__) || !defined(GGML_USE_OPENMP)
|
||||||
|
#undef GGML_USE_AMX
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_LLAMAFILE
|
#ifdef GGML_USE_LLAMAFILE
|
||||||
#include <llamafile/sgemm.h>
|
#include <llamafile/sgemm.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_AMX
|
||||||
|
#include <ggml-amx/mmq.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
// disable "possible loss of data" to avoid hundreds of casts
|
// disable "possible loss of data" to avoid hundreds of casts
|
||||||
// we should just be careful :)
|
// we should just be careful :)
|
||||||
|
@ -430,6 +439,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
||||||
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
||||||
float ggml_table_f32_f16[1 << 16];
|
float ggml_table_f32_f16[1 << 16];
|
||||||
|
|
||||||
|
#if GGML_USE_AMX
|
||||||
|
// global flag for amx init
|
||||||
|
static bool ggml_amx_initialized = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
|
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
|
||||||
switch (status) {
|
switch (status) {
|
||||||
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
||||||
|
@ -3693,6 +3707,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if GGML_USE_AMX
|
||||||
|
ggml_amx_initialized = ggml_amx_init();
|
||||||
|
#endif
|
||||||
|
|
||||||
is_first_call = false;
|
is_first_call = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12819,6 +12837,13 @@ static void ggml_compute_forward_mul_mat(
|
||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
|
#if GGML_USE_AMX
|
||||||
|
if (ggml_compute_forward_mul_mat_use_amx(dst) && ggml_amx_initialized) {
|
||||||
|
ggml_mul_mat_amx(dst, nth, ith, params->wdata, params->wsize);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if GGML_USE_LLAMAFILE
|
#if GGML_USE_LLAMAFILE
|
||||||
// broadcast factors
|
// broadcast factors
|
||||||
const int64_t r2 = ne12 / ne02;
|
const int64_t r2 = ne12 / ne02;
|
||||||
|
@ -23256,6 +23281,14 @@ int ggml_cpu_has_avx512_bf16(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this build was compiled with AMX-INT8 enabled.
// Returns 1 when the compiler defines __AMX_INT8__, otherwise 0.
// This is a compile-time check only; it does not probe the CPU at runtime.
int ggml_cpu_has_amx_int8(void) {
|
||||||
|
#if defined(__AMX_INT8__)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_fma(void) {
|
int ggml_cpu_has_fma(void) {
|
||||||
#if defined(__FMA__)
|
#if defined(__FMA__)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -20678,6 +20678,7 @@ const char * llama_print_system_info(void) {
|
||||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||||
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
||||||
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
||||||
|
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
|
||||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||||
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue