ggml : move AMX to the CPU backend (#10570)
* ggml : move AMX to the CPU backend --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		
							parent
							
								
									b782e5c7d4
								
							
						
					
					
						commit
						7cc2d2c889
					
				
					 64 changed files with 514 additions and 801 deletions
				
			
		|  | @ -17,8 +17,10 @@ Checks: > | ||||||
|     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, |     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, | ||||||
|     performance-*, |     performance-*, | ||||||
|     portability-*, |     portability-*, | ||||||
|  |     -portability-simd-intrinsics, | ||||||
|     misc-*, |     misc-*, | ||||||
|     -misc-const-correctness, |     -misc-const-correctness, | ||||||
|     -misc-non-private-member-variables-in-classes, |     -misc-non-private-member-variables-in-classes, | ||||||
|     -misc-no-recursion, |     -misc-no-recursion, | ||||||
|  |     -misc-use-anonymous-namespace, | ||||||
| FormatStyle: none | FormatStyle: none | ||||||
|  |  | ||||||
							
								
								
									
										5
									
								
								.github/workflows/build.yml
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.github/workflows/build.yml
									
										
									
									
										vendored
									
									
								
							|  | @ -1121,6 +1121,11 @@ jobs: | ||||||
|         run: | |         run: | | ||||||
|           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version |           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version | ||||||
| 
 | 
 | ||||||
|  |       - name: Install ccache | ||||||
|  |         uses: hendrikmuhs/ccache-action@v1.2 | ||||||
|  |         with: | ||||||
|  |           key: ${{ github.job }} | ||||||
|  | 
 | ||||||
|       - name: Build |       - name: Build | ||||||
|         id: cmake_build |         id: cmake_build | ||||||
|         run: | |         run: | | ||||||
|  |  | ||||||
							
								
								
									
										9
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										9
									
								
								Makefile
									
										
									
									
									
								
							|  | @ -254,8 +254,8 @@ endif | ||||||
| # keep standard at C11 and C++11
 | # keep standard at C11 and C++11
 | ||||||
| MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU | MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU | ||||||
| MK_CFLAGS    = -std=c11   -fPIC | MK_CFLAGS    = -std=c11   -fPIC | ||||||
| MK_CXXFLAGS  = -std=c++11 -fPIC | MK_CXXFLAGS  = -std=c++17 -fPIC | ||||||
| MK_NVCCFLAGS = -std=c++11 | MK_NVCCFLAGS = -std=c++17 | ||||||
| 
 | 
 | ||||||
| ifdef LLAMA_NO_CCACHE | ifdef LLAMA_NO_CCACHE | ||||||
| GGML_NO_CCACHE := 1 | GGML_NO_CCACHE := 1 | ||||||
|  | @ -575,9 +575,12 @@ endif | ||||||
| 
 | 
 | ||||||
| ifndef GGML_NO_AMX | ifndef GGML_NO_AMX | ||||||
| 	MK_CPPFLAGS += -DGGML_USE_AMX | 	MK_CPPFLAGS += -DGGML_USE_AMX | ||||||
| 	OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o | 	OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o | ||||||
| endif | endif | ||||||
| 
 | 
 | ||||||
|  | # only necessary for the CPU backend files
 | ||||||
|  | MK_CPPFLAGS += -Iggml/src/ggml-cpu | ||||||
|  | 
 | ||||||
| ifdef GGML_RPC | ifdef GGML_RPC | ||||||
| 	MK_CPPFLAGS  += -DGGML_USE_RPC | 	MK_CPPFLAGS  += -DGGML_USE_RPC | ||||||
| 	OBJ_GGML_EXT += ggml/src/ggml-rpc.o | 	OBJ_GGML_EXT += ggml/src/ggml-rpc.o | ||||||
|  |  | ||||||
|  | @ -28,13 +28,16 @@ var cSettings: [CSetting] =  [ | ||||||
|     .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), |     .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), | ||||||
|     .unsafeFlags(["-fno-objc-arc"]), |     .unsafeFlags(["-fno-objc-arc"]), | ||||||
|     .headerSearchPath("ggml/src"), |     .headerSearchPath("ggml/src"), | ||||||
|  |     .headerSearchPath("ggml/src/ggml-cpu"), | ||||||
|     // NOTE: NEW_LAPACK will required iOS version 16.4+ |     // NOTE: NEW_LAPACK will required iOS version 16.4+ | ||||||
|     // We should consider add this in the future when we drop support for iOS 14 |     // We should consider add this in the future when we drop support for iOS 14 | ||||||
|     // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) |     // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) | ||||||
|     // .define("ACCELERATE_NEW_LAPACK"), |     // .define("ACCELERATE_NEW_LAPACK"), | ||||||
|     // .define("ACCELERATE_LAPACK_ILP64") |     // .define("ACCELERATE_LAPACK_ILP64") | ||||||
|  |     .define("GGML_USE_CPU"), | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| #if canImport(Darwin) | #if canImport(Darwin) | ||||||
| sources.append("ggml/src/ggml-common.h") | sources.append("ggml/src/ggml-common.h") | ||||||
| sources.append("ggml/src/ggml-metal/ggml-metal.m") | sources.append("ggml/src/ggml-metal/ggml-metal.m") | ||||||
|  | @ -44,7 +47,6 @@ cSettings.append( | ||||||
|     contentsOf: [ |     contentsOf: [ | ||||||
|         .define("GGML_USE_ACCELERATE"), |         .define("GGML_USE_ACCELERATE"), | ||||||
|         .define("GGML_USE_METAL"), |         .define("GGML_USE_METAL"), | ||||||
|         .define("GGML_USE_CPU") |  | ||||||
|     ] |     ] | ||||||
| ) | ) | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -88,5 +88,5 @@ if (LLAMA_CURL) | ||||||
| endif () | endif () | ||||||
| 
 | 
 | ||||||
| target_include_directories(${TARGET} PUBLIC .) | target_include_directories(${TARGET} PUBLIC .) | ||||||
| target_compile_features   (${TARGET} PUBLIC cxx_std_11) | target_compile_features   (${TARGET} PUBLIC cxx_std_17) | ||||||
| target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) | target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) | ||||||
|  |  | ||||||
|  | @ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) { | ||||||
| 
 | 
 | ||||||
|     std::u32string filename_utf32; |     std::u32string filename_utf32; | ||||||
|     try { |     try { | ||||||
|  | #if defined(__clang__) | ||||||
|  |         // disable C++17 deprecation warning for std::codecvt_utf8
 | ||||||
|  | #    pragma clang diagnostic push | ||||||
|  | #    pragma clang diagnostic ignored "-Wdeprecated-declarations" | ||||||
|  | #endif | ||||||
|         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; |         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; | ||||||
|  | 
 | ||||||
|  | #if defined(__clang__) | ||||||
|  | #    pragma clang diagnostic pop | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|         filename_utf32 = converter.from_bytes(filename); |         filename_utf32 = converter.from_bytes(filename); | ||||||
| 
 | 
 | ||||||
|         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
 |         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
 | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-batched-bench) | ||||||
| add_executable(${TARGET} batched-bench.cpp) | add_executable(${TARGET} batched-bench.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-batched) | ||||||
| add_executable(${TARGET} batched.cpp) | add_executable(${TARGET} batched.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml) | ||||||
| add_executable(${TARGET} convert-llama2c-to-ggml.cpp) | add_executable(${TARGET} convert-llama2c-to-ggml.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-cvector-generator) | ||||||
| add_executable(${TARGET} cvector-generator.cpp pca.hpp) | add_executable(${TARGET} cvector-generator.cpp pca.hpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-embedding) | ||||||
| add_executable(${TARGET} embedding.cpp) | add_executable(${TARGET} embedding.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,7 +2,7 @@ set(TARGET llama-eval-callback) | ||||||
| add_executable(${TARGET} eval-callback.cpp) | add_executable(${TARGET} eval-callback.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TEST_TARGET test-eval-callback) | set(TEST_TARGET test-eval-callback) | ||||||
| add_test(NAME ${TEST_TARGET} | add_test(NAME ${TEST_TARGET} | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-export-lora) | ||||||
| add_executable(${TARGET} export-lora.cpp) | add_executable(${TARGET} export-lora.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator) | ||||||
| add_executable(${TARGET} gbnf-validator.cpp) | add_executable(${TARGET} gbnf-validator.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-gen-docs) | ||||||
| add_executable(${TARGET} gen-docs.cpp) | add_executable(${TARGET} gen-docs.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) | ||||||
| target_link_libraries(${TARGET} PRIVATE sha256) | target_link_libraries(${TARGET} PRIVATE sha256) | ||||||
| 
 | 
 | ||||||
| target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-gguf-split) | ||||||
| add_executable(${TARGET} gguf-split.cpp) | add_executable(${TARGET} gguf-split.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-gguf) | ||||||
| add_executable(${TARGET} gguf.cpp) | add_executable(${TARGET} gguf.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-gritlm) | ||||||
| add_executable(${TARGET} gritlm.cpp) | add_executable(${TARGET} gritlm.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-imatrix) | ||||||
| add_executable(${TARGET} imatrix.cpp) | add_executable(${TARGET} imatrix.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-infill) | ||||||
| add_executable(${TARGET} infill.cpp) | add_executable(${TARGET} infill.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-bench) | ||||||
| add_executable(${TARGET} llama-bench.cpp) | add_executable(${TARGET} llama-bench.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .) | ||||||
| target_include_directories(llava PUBLIC ../..) | target_include_directories(llava PUBLIC ../..) | ||||||
| target_include_directories(llava PUBLIC ../../common) | target_include_directories(llava PUBLIC ../../common) | ||||||
| 
 | 
 | ||||||
| target_compile_features(llava PRIVATE cxx_std_11) | target_compile_features(llava PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| add_library(llava_static STATIC $<TARGET_OBJECTS:llava>) | add_library(llava_static STATIC $<TARGET_OBJECTS:llava>) | ||||||
| if (BUILD_SHARED_LIBS) | if (BUILD_SHARED_LIBS) | ||||||
|  | @ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp) | ||||||
| set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) | set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-minicpmv-cli) | set(TARGET llama-minicpmv-cli) | ||||||
| add_executable(${TARGET} minicpmv-cli.cpp) | add_executable(${TARGET} minicpmv-cli.cpp) | ||||||
| set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) | set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-lookahead) | ||||||
| add_executable(${TARGET} lookahead.cpp) | add_executable(${TARGET} lookahead.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,22 +2,22 @@ set(TARGET llama-lookup) | ||||||
| add_executable(${TARGET} lookup.cpp) | add_executable(${TARGET} lookup.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-lookup-create) | set(TARGET llama-lookup-create) | ||||||
| add_executable(${TARGET} lookup-create.cpp) | add_executable(${TARGET} lookup-create.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-lookup-merge) | set(TARGET llama-lookup-merge) | ||||||
| add_executable(${TARGET} lookup-merge.cpp) | add_executable(${TARGET} lookup-merge.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-lookup-stats) | set(TARGET llama-lookup-stats) | ||||||
| add_executable(${TARGET} lookup-stats.cpp) | add_executable(${TARGET} lookup-stats.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) | ||||||
| target_include_directories(${TARGET} PRIVATE ${_common_path}) | target_include_directories(${TARGET} PRIVATE ${_common_path}) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-cli) | ||||||
| add_executable(${TARGET} main.cpp) | add_executable(${TARGET} main.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-parallel) | ||||||
| add_executable(${TARGET} parallel.cpp) | add_executable(${TARGET} parallel.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-passkey) | ||||||
| add_executable(${TARGET} passkey.cpp) | add_executable(${TARGET} passkey.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-perplexity) | ||||||
| add_executable(${TARGET} perplexity.cpp) | add_executable(${TARGET} perplexity.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_include_directories(${TARGET} PRIVATE ../../common) | target_include_directories(${TARGET} PRIVATE ../../common) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_include_directories(${TARGET} PRIVATE ../../common) | target_include_directories(${TARGET} PRIVATE ../../common) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-retrieval) | ||||||
| add_executable(${TARGET} retrieval.cpp) | add_executable(${TARGET} retrieval.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-run) | ||||||
| add_executable(${TARGET} run.cpp) | add_executable(${TARGET} run.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-save-load-state) | ||||||
| add_executable(${TARGET} save-load-state.cpp) | add_executable(${TARGET} save-load-state.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -50,4 +50,4 @@ if (WIN32) | ||||||
|     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) |     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-simple-chat) | ||||||
| add_executable(${TARGET} simple-chat.cpp) | add_executable(${TARGET} simple-chat.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-simple) | ||||||
| add_executable(${TARGET} simple.cpp) | add_executable(${TARGET} simple.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-speculative-simple) | ||||||
| add_executable(${TARGET} speculative-simple.cpp) | add_executable(${TARGET} speculative-simple.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-speculative) | ||||||
| add_executable(${TARGET} speculative.cpp) | add_executable(${TARGET} speculative.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -2,4 +2,4 @@ set(TARGET llama-tokenize) | ||||||
| add_executable(${TARGET} tokenize.cpp) | add_executable(${TARGET} tokenize.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -161,7 +161,6 @@ set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING | ||||||
| set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)") | set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)") | ||||||
| option(GGML_OPENMP                          "ggml: use OpenMP"                                ON) | option(GGML_OPENMP                          "ggml: use OpenMP"                                ON) | ||||||
| option(GGML_RPC                             "ggml: use RPC"                                   OFF) | option(GGML_RPC                             "ggml: use RPC"                                   OFF) | ||||||
| option(GGML_AMX                             "ggml: use AMX"                                   OFF) |  | ||||||
| option(GGML_SYCL                            "ggml: use SYCL"                                  OFF) | option(GGML_SYCL                            "ggml: use SYCL"                                  OFF) | ||||||
| option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF) | option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF) | ||||||
| set   (GGML_SYCL_TARGET "INTEL" CACHE STRING | set   (GGML_SYCL_TARGET "INTEL" CACHE STRING | ||||||
|  |  | ||||||
|  | @ -1,25 +0,0 @@ | ||||||
| #pragma once |  | ||||||
| 
 |  | ||||||
| #include "ggml.h" |  | ||||||
| #include "ggml-backend.h" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| #ifdef  __cplusplus |  | ||||||
| extern "C" { |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| // buffer_type API
 |  | ||||||
| GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); |  | ||||||
| 
 |  | ||||||
| GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend); |  | ||||||
| 
 |  | ||||||
| // backend API
 |  | ||||||
| GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void); |  | ||||||
| 
 |  | ||||||
| GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); |  | ||||||
| 
 |  | ||||||
| GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void); |  | ||||||
| 
 |  | ||||||
| #ifdef  __cplusplus |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  | @ -261,21 +261,15 @@ function(ggml_add_backend backend) | ||||||
|     if (${backend_id}) |     if (${backend_id}) | ||||||
|         string(TOLOWER "ggml-${backend}" backend_target) |         string(TOLOWER "ggml-${backend}" backend_target) | ||||||
|         add_subdirectory(${backend_target}) |         add_subdirectory(${backend_target}) | ||||||
|         # check again in case the backend disabled itself |  | ||||||
|         # note that this should NOT be the normal behavior, in case of errors the backend should fail the build |  | ||||||
|         # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp |  | ||||||
|         if (${backend_id}) |  | ||||||
|         message(STATUS "Including ${backend} backend") |         message(STATUS "Including ${backend} backend") | ||||||
|         if (NOT GGML_BACKEND_DL) |         if (NOT GGML_BACKEND_DL) | ||||||
|             string(TOUPPER "GGML_USE_${backend}" backend_use) |             string(TOUPPER "GGML_USE_${backend}" backend_use) | ||||||
|             target_compile_definitions(ggml PUBLIC ${backend_use}) |             target_compile_definitions(ggml PUBLIC ${backend_use}) | ||||||
|         endif() |         endif() | ||||||
|     endif() |     endif() | ||||||
|     endif() |  | ||||||
| endfunction() | endfunction() | ||||||
| 
 | 
 | ||||||
| ggml_add_backend(CPU) | ggml_add_backend(CPU) | ||||||
| ggml_add_backend(AMX) |  | ||||||
| ggml_add_backend(BLAS) | ggml_add_backend(BLAS) | ||||||
| ggml_add_backend(CANN) | ggml_add_backend(CANN) | ||||||
| ggml_add_backend(CUDA) | ggml_add_backend(CUDA) | ||||||
|  | @ -289,7 +283,7 @@ ggml_add_backend(Vulkan) | ||||||
| 
 | 
 | ||||||
| foreach (target ggml-base ggml) | foreach (target ggml-base ggml) | ||||||
|     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) |     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) | ||||||
|     target_compile_features   (${target} PRIVATE c_std_11) # don't bump |     target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump | ||||||
| endforeach() | endforeach() | ||||||
| 
 | 
 | ||||||
| target_link_libraries(ggml-base PRIVATE Threads::Threads) | target_link_libraries(ggml-base PRIVATE Threads::Threads) | ||||||
|  |  | ||||||
|  | @ -1,105 +0,0 @@ | ||||||
| if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR |  | ||||||
|         (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND |  | ||||||
|          CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND |  | ||||||
|         CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) |  | ||||||
|     message(STATUS "Using AMX") |  | ||||||
| 
 |  | ||||||
|     file(GLOB   GGML_HEADERS_AMX "*.h") |  | ||||||
|     list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h") |  | ||||||
| 
 |  | ||||||
|     file(GLOB   GGML_SOURCES_AMX "*.cpp") |  | ||||||
| 
 |  | ||||||
|     ggml_add_backend_library(ggml-amx |  | ||||||
|                              ${GGML_HEADERS_AMX} |  | ||||||
|                              ${GGML_SOURCES_AMX} |  | ||||||
|                             ) |  | ||||||
| 
 |  | ||||||
|     # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags |  | ||||||
|     # TODO: integrate AMX backend into the CPU backend |  | ||||||
|     if (MSVC) |  | ||||||
|         # instruction set detection for MSVC only |  | ||||||
|         if (GGML_NATIVE) |  | ||||||
|             # TODO: improve, should not reference files from the parent folder |  | ||||||
|             include(../ggml-cpu/cmake/FindSIMD.cmake) |  | ||||||
|         endif () |  | ||||||
|         if (GGML_AVX512) |  | ||||||
|             list(APPEND ARCH_FLAGS /arch:AVX512) |  | ||||||
|             # MSVC has no compile-time flags enabling specific |  | ||||||
|             # AVX512 extensions, neither it defines the |  | ||||||
|             # macros corresponding to the extensions. |  | ||||||
|             # Do it manually. |  | ||||||
|             if (GGML_AVX512_VBMI) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>) |  | ||||||
|             endif() |  | ||||||
|             if (GGML_AVX512_VNNI) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) |  | ||||||
|             endif() |  | ||||||
|             if (GGML_AVX512_BF16) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>) |  | ||||||
|             endif() |  | ||||||
|             if (GGML_AMX_TILE) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>) |  | ||||||
|             endif() |  | ||||||
|             if (GGML_AMX_INT8) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>) |  | ||||||
|             endif() |  | ||||||
|             if (GGML_AMX_BF16) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) |  | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>) |  | ||||||
|             endif() |  | ||||||
|         elseif (GGML_AVX2) |  | ||||||
|             list(APPEND ARCH_FLAGS /arch:AVX2) |  | ||||||
|         elseif (GGML_AVX) |  | ||||||
|             list(APPEND ARCH_FLAGS /arch:AVX) |  | ||||||
|         endif() |  | ||||||
|     else() |  | ||||||
|         if (GGML_NATIVE) |  | ||||||
|             list(APPEND ARCH_FLAGS -march=native) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_F16C) |  | ||||||
|             list(APPEND ARCH_FLAGS -mf16c) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_FMA) |  | ||||||
|             list(APPEND ARCH_FLAGS -mfma) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX2) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx2) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX512) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512f) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512dq) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512bw) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX512_VBMI) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512vbmi) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX512_VNNI) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512vnni) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AVX512_BF16) |  | ||||||
|             list(APPEND ARCH_FLAGS -mavx512bf16) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AMX_TILE) |  | ||||||
|             list(APPEND ARCH_FLAGS -mamx-tile) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AMX_INT8) |  | ||||||
|             list(APPEND ARCH_FLAGS -mamx-int8) |  | ||||||
|         endif() |  | ||||||
|         if (GGML_AMX_BF16) |  | ||||||
|             list(APPEND ARCH_FLAGS -mamx-bf16) |  | ||||||
|         endif() |  | ||||||
|     endif() |  | ||||||
| 
 |  | ||||||
|     target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS}) |  | ||||||
| else() |  | ||||||
|     set(GGML_AMX OFF PARENT_SCOPE) |  | ||||||
|     message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.") |  | ||||||
| endif() |  | ||||||
|  | @ -1,449 +0,0 @@ | ||||||
| #include "ggml-amx.h" |  | ||||||
| #include "ggml-amx/common.h" |  | ||||||
| #include "ggml-amx/mmq.h" |  | ||||||
| #include "ggml-backend-impl.h" |  | ||||||
| #include "ggml-impl.h" |  | ||||||
| 
 |  | ||||||
| #if defined(__gnu_linux__) |  | ||||||
| #include <sys/syscall.h> |  | ||||||
| #include <unistd.h> |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #include <cstdlib> |  | ||||||
| #include <cstring> |  | ||||||
| #include <memory> |  | ||||||
| 
 |  | ||||||
| #if defined(__AMX_INT8__) |  | ||||||
| 
 |  | ||||||
| // AMX buffer interface
 |  | ||||||
| static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { |  | ||||||
|     free(buffer->context); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { |  | ||||||
|     return (void *)(buffer->context); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { |  | ||||||
|     memset((char *)tensor->data + offset, value, size); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buffer); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |  | ||||||
|     if (qtype_has_amx_kernels(tensor->type)) { |  | ||||||
|         ggml_backend_amx_convert_weight(tensor, data, offset, size); |  | ||||||
|     } else { |  | ||||||
|         memcpy((char *)tensor->data + offset, data, size); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buffer); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { |  | ||||||
|     GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); |  | ||||||
|     memcpy(data, (const char *)tensor->data + offset, size); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buffer); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { |  | ||||||
|     if (ggml_backend_buffer_is_host(src->buffer)) { |  | ||||||
|         if (qtype_has_amx_kernels(src->type)) { |  | ||||||
|             ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst)); |  | ||||||
|         } else { |  | ||||||
|             memcpy(dst->data, src->data, ggml_nbytes(src)); |  | ||||||
|         } |  | ||||||
|         return true; |  | ||||||
|     } |  | ||||||
|     return false; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buffer); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { |  | ||||||
|     memset(buffer->context, value, buffer->size); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { |  | ||||||
|     /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer, |  | ||||||
|     /* .get_base        = */ ggml_backend_amx_buffer_get_base, |  | ||||||
|     /* .init_tensor     = */ NULL, // no initialization required
 |  | ||||||
|     /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor, |  | ||||||
|     /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor, |  | ||||||
|     /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor, |  | ||||||
|     /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor, |  | ||||||
|     /* .clear           = */ ggml_backend_amx_buffer_clear, |  | ||||||
|     /* .reset           = */ NULL, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { |  | ||||||
|     return "AMX"; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buft); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { |  | ||||||
|     void * data = aligned_alloc(TENSOR_ALIGNMENT, size); |  | ||||||
|     if (data == NULL) { |  | ||||||
|         fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); |  | ||||||
|         return NULL; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { |  | ||||||
|     return TENSOR_ALIGNMENT; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buft); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { |  | ||||||
|     return ggml_backend_amx_get_alloc_size(tensor); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buft); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { |  | ||||||
|     return false; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(buft); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { |  | ||||||
|     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { |  | ||||||
|         /* .iface = */ { |  | ||||||
|             /* .get_name         = */ ggml_backend_amx_buffer_type_get_name, |  | ||||||
|             /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer, |  | ||||||
|             /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment, |  | ||||||
|             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
 |  | ||||||
|             /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size, |  | ||||||
|             /* .is_host          = */ ggml_backend_amx_buffer_type_is_host, |  | ||||||
|         }, |  | ||||||
|         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), |  | ||||||
|         /* .context = */ NULL, |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     return &ggml_backend_buffer_type_amx; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| // backend interface
 |  | ||||||
| 
 |  | ||||||
| static const char * ggml_backend_amx_name(ggml_backend_t backend) { |  | ||||||
|     return "AMX"; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(backend); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_free(ggml_backend_t backend) { |  | ||||||
|     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; |  | ||||||
|     delete ctx; |  | ||||||
|     delete backend; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |  | ||||||
|     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; |  | ||||||
| 
 |  | ||||||
|     for (int i = 0; i < cgraph->n_nodes; i++) { |  | ||||||
|         struct ggml_tensor * node = cgraph->nodes[i]; |  | ||||||
| 
 |  | ||||||
|         switch (node->op) { |  | ||||||
|         case GGML_OP_MUL_MAT: |  | ||||||
|             ggml_backend_amx_mul_mat(ctx, node); |  | ||||||
|             break; |  | ||||||
| 
 |  | ||||||
|         case GGML_OP_NONE: |  | ||||||
|         case GGML_OP_RESHAPE: |  | ||||||
|         case GGML_OP_VIEW: |  | ||||||
|         case GGML_OP_PERMUTE: |  | ||||||
|         case GGML_OP_TRANSPOSE: |  | ||||||
|             break; |  | ||||||
| 
 |  | ||||||
|         default: |  | ||||||
|             fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); |  | ||||||
|             GGML_ASSERT(false); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     return GGML_STATUS_SUCCESS; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(backend); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static struct ggml_backend_i ggml_backend_amx_i = { |  | ||||||
|     /* .get_name                = */ ggml_backend_amx_name, |  | ||||||
|     /* .free                    = */ ggml_backend_amx_free, |  | ||||||
|     /* .set_tensor_async        = */ NULL, |  | ||||||
|     /* .get_tensor_async        = */ NULL, |  | ||||||
|     /* .cpy_tensor_async        = */ NULL, |  | ||||||
|     /* .synchronize             = */ NULL, |  | ||||||
|     /* .graph_plan_create       = */ NULL, |  | ||||||
|     /* .graph_plan_free         = */ NULL, |  | ||||||
|     /* .graph_plan_update       = */ NULL, |  | ||||||
|     /* .graph_plan_compute      = */ NULL, |  | ||||||
|     /* .graph_compute           = */ ggml_backend_amx_graph_compute, |  | ||||||
|     /* .event_record            = */ NULL, |  | ||||||
|     /* .event_wait              = */ NULL, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static ggml_guid_t ggml_backend_amx_guid() { |  | ||||||
|     static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e }; |  | ||||||
|     return &guid; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #define ARCH_GET_XCOMP_PERM     0x1022 |  | ||||||
| #define ARCH_REQ_XCOMP_PERM     0x1023 |  | ||||||
| #define XFEATURE_XTILECFG       17 |  | ||||||
| #define XFEATURE_XTILEDATA      18 |  | ||||||
| 
 |  | ||||||
| static bool ggml_amx_init() { |  | ||||||
| #if defined(__gnu_linux__) |  | ||||||
|     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { |  | ||||||
|         fprintf(stderr, "AMX is not ready to be used!\n"); |  | ||||||
|         return false; |  | ||||||
|     } |  | ||||||
|     return true; |  | ||||||
| #elif defined(_WIN32) |  | ||||||
|     return true; |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| ggml_backend_t ggml_backend_amx_init() { |  | ||||||
| 
 |  | ||||||
|     // invoke a Linux system call to request access to AMX features
 |  | ||||||
|     ggml_amx_init(); |  | ||||||
| 
 |  | ||||||
|     // backend context
 |  | ||||||
|     ggml_backend_amx_context * ctx = new ggml_backend_amx_context; |  | ||||||
| 
 |  | ||||||
|     // ggml amx backend
 |  | ||||||
|     ggml_backend_t backend = new ggml_backend { |  | ||||||
|         /* .guid      = */ ggml_backend_amx_guid(), |  | ||||||
|         /* .interface = */ ggml_backend_amx_i, |  | ||||||
|         /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), |  | ||||||
|         /* .context   = */ ctx, |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     return backend; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool ggml_backend_is_amx(ggml_backend_t backend) { |  | ||||||
|     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid()); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { |  | ||||||
|     GGML_ASSERT(ggml_backend_is_amx(backend_amx)); |  | ||||||
| 
 |  | ||||||
|     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context; |  | ||||||
|     ctx->n_threads = n_threads; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| // device interface
 |  | ||||||
| 
 |  | ||||||
| static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) { |  | ||||||
|     return "AMX"; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) { |  | ||||||
|     return "Intel Advanced Matrix Extensions"; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { |  | ||||||
|     // TODO
 |  | ||||||
|     *free = 0; |  | ||||||
|     *total = 0; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { |  | ||||||
|     return GGML_BACKEND_DEVICE_TYPE_ACCEL; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { |  | ||||||
|     props->name        = ggml_backend_amx_device_get_name(dev); |  | ||||||
|     props->description = ggml_backend_amx_device_get_description(dev); |  | ||||||
|     props->type        = ggml_backend_amx_device_get_type(dev); |  | ||||||
|     ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total); |  | ||||||
| 
 |  | ||||||
|     // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
 |  | ||||||
|     props->caps = { |  | ||||||
|         /* .async                 = */ false, |  | ||||||
|         /* .host_buffer           = */ false, |  | ||||||
|         /* .buffer_from_host_ptr  = */ false, |  | ||||||
|         /* .events                = */ false, |  | ||||||
|     }; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) { |  | ||||||
|     return ggml_backend_amx_init(); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
|     GGML_UNUSED(params); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) { |  | ||||||
|     return ggml_backend_amx_buffer_type(); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { |  | ||||||
| 
 |  | ||||||
|     // handle only 2d gemm for now
 |  | ||||||
|     auto is_contiguous_2d = [](const struct ggml_tensor * t) { |  | ||||||
|         return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     switch (op->op) { |  | ||||||
|         case GGML_OP_NONE: |  | ||||||
|         case GGML_OP_RESHAPE: |  | ||||||
|         case GGML_OP_VIEW: |  | ||||||
|         case GGML_OP_PERMUTE: |  | ||||||
|         case GGML_OP_TRANSPOSE: |  | ||||||
|             return true; |  | ||||||
| 
 |  | ||||||
|         case GGML_OP_MUL_MAT: { |  | ||||||
|             const struct ggml_tensor * src0 = op->src[0]; |  | ||||||
|             const struct ggml_tensor * src1 = op->src[1]; |  | ||||||
| 
 |  | ||||||
|             const enum ggml_type type = src0->type; |  | ||||||
|             const int64_t ne0 = op->ne[0]; |  | ||||||
| 
 |  | ||||||
|             // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
 |  | ||||||
|             // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
 |  | ||||||
|             bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); |  | ||||||
| 
 |  | ||||||
|             bool can_use_amx = |  | ||||||
|                 is_contiguous_2d(src0) &&       // src0 must be contiguous
 |  | ||||||
|                 is_contiguous_2d(src1) &&       // src1 must be contiguous
 |  | ||||||
|                 src1->type == GGML_TYPE_F32 &&  // src1 must be float32
 |  | ||||||
|                 has_amx_kernels &&              // with amx kernel impls
 |  | ||||||
|                 ne0 % (TILE_N * 2) == 0;        // out_features is 32x
 |  | ||||||
| 
 |  | ||||||
|             return can_use_amx; |  | ||||||
|         } |  | ||||||
|         default: |  | ||||||
|             return false; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { |  | ||||||
|     return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(dev); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static const struct ggml_backend_device_i ggml_backend_amx_device_i = { |  | ||||||
|     /* .get_name             = */ ggml_backend_amx_device_get_name, |  | ||||||
|     /* .get_description      = */ ggml_backend_amx_device_get_description, |  | ||||||
|     /* .get_memory           = */ ggml_backend_amx_device_get_memory, |  | ||||||
|     /* .get_type             = */ ggml_backend_amx_device_get_type, |  | ||||||
|     /* .get_props            = */ ggml_backend_amx_device_get_props, |  | ||||||
|     /* .init_backend         = */ ggml_backend_amx_device_init, |  | ||||||
|     /* .get_buffer_type      = */ ggml_backend_amx_device_get_buffer_type, |  | ||||||
|     /* .get_host_buffer_type = */ NULL, |  | ||||||
|     /* .buffer_from_host_ptr = */ NULL, |  | ||||||
|     /* .supports_op          = */ ggml_backend_amx_device_supports_op, |  | ||||||
|     /* .supports_buft        = */ ggml_backend_amx_device_supports_buft, |  | ||||||
|     /* .offload_op           = */ NULL, |  | ||||||
|     /* .event_new            = */ NULL, |  | ||||||
|     /* .event_free           = */ NULL, |  | ||||||
|     /* .event_synchronize    = */ NULL, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| // backend reg interface
 |  | ||||||
| 
 |  | ||||||
| static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) { |  | ||||||
|     return "AMX"; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(reg); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) { |  | ||||||
|     return 1; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(reg); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) { |  | ||||||
|     GGML_ASSERT(index == 0); |  | ||||||
| 
 |  | ||||||
|     static ggml_backend_device ggml_backend_amx_device = { |  | ||||||
|         /* .iface   = */ ggml_backend_amx_device_i, |  | ||||||
|         /* .reg     = */ reg, |  | ||||||
|         /* .context = */ nullptr, |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     return &ggml_backend_amx_device; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(reg); |  | ||||||
|     GGML_UNUSED(index); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) { |  | ||||||
|     if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { |  | ||||||
|         return (void *)ggml_backend_amx_set_n_threads; |  | ||||||
|     } |  | ||||||
|     return NULL; |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(reg); |  | ||||||
|     GGML_UNUSED(name); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { |  | ||||||
|     /* .get_name         = */ ggml_backend_amx_reg_get_name, |  | ||||||
|     /* .get_device_count = */ ggml_backend_amx_reg_get_device_count, |  | ||||||
|     /* .get_device       = */ ggml_backend_amx_reg_get_device, |  | ||||||
|     /* .get_proc_address = */ ggml_backend_amx_get_proc_address, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| ggml_backend_reg_t ggml_backend_amx_reg(void) { |  | ||||||
|     static struct ggml_backend_reg ggml_backend_amx_reg = { |  | ||||||
|         /* .api_version = */ GGML_BACKEND_API_VERSION, |  | ||||||
|         /* .iface       = */ ggml_backend_amx_reg_i, |  | ||||||
|         /* .context     = */ NULL, |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     return &ggml_backend_amx_reg; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #else // if defined(__AMX_INT8__)
 |  | ||||||
| 
 |  | ||||||
| ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) { |  | ||||||
|     return nullptr; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool ggml_backend_is_amx(ggml_backend_t backend) { |  | ||||||
|     GGML_UNUSED(backend); |  | ||||||
|     return false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| ggml_backend_t ggml_backend_amx_init(void) { |  | ||||||
|     fprintf(stderr, "GGML is not compiled with AMX support!\n"); |  | ||||||
|     return nullptr; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { |  | ||||||
|     fprintf(stderr, "GGML is not compiled with AMX support!\n"); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(backend_amx); |  | ||||||
|     GGML_UNUSED(n_threads); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| ggml_backend_reg_t ggml_backend_amx_reg(void) { |  | ||||||
|     return nullptr; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) |  | ||||||
|  | @ -49,10 +49,6 @@ | ||||||
| #include "ggml-rpc.h" | #include "ggml-rpc.h" | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_AMX |  | ||||||
| #  include "ggml-amx.h" |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #ifdef GGML_USE_CANN | #ifdef GGML_USE_CANN | ||||||
| #include "ggml-cann.h" | #include "ggml-cann.h" | ||||||
| #endif | #endif | ||||||
|  | @ -92,9 +88,6 @@ struct ggml_backend_registry { | ||||||
| #ifdef GGML_USE_RPC | #ifdef GGML_USE_RPC | ||||||
|         register_backend(ggml_backend_rpc_reg()); |         register_backend(ggml_backend_rpc_reg()); | ||||||
| #endif | #endif | ||||||
| #ifdef GGML_USE_AMX |  | ||||||
|         register_backend(ggml_backend_amx_reg()); |  | ||||||
| #endif |  | ||||||
| #ifdef GGML_USE_KOMPUTE | #ifdef GGML_USE_KOMPUTE | ||||||
|         register_backend(ggml_backend_kompute_reg()); |         register_backend(ggml_backend_kompute_reg()); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st | ||||||
| 
 | 
 | ||||||
|     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { |     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { | ||||||
|         // since the tensor is pre-allocated, it cannot be moved to another backend
 |         // since the tensor is pre-allocated, it cannot be moved to another backend
 | ||||||
|         GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name); |         ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; | ||||||
|  |         GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op)); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // graph input
 |     // graph input
 | ||||||
|  |  | ||||||
|  | @ -1,12 +1,20 @@ | ||||||
| ggml_add_backend_library(ggml-cpu | ggml_add_backend_library(ggml-cpu) | ||||||
|  | 
 | ||||||
|  | list (APPEND GGML_CPU_SOURCES | ||||||
|     ggml-cpu.c |     ggml-cpu.c | ||||||
|     ggml-cpu.cpp |     ggml-cpu.cpp | ||||||
|     ggml-cpu-aarch64.c |     ggml-cpu-aarch64.c | ||||||
|     ggml-cpu-aarch64.h |     ggml-cpu-aarch64.h | ||||||
|     ggml-cpu-quants.c |     ggml-cpu-quants.c | ||||||
|     ggml-cpu-quants.h |     ggml-cpu-quants.h | ||||||
|  |     amx/amx.cpp | ||||||
|  |     amx/amx.h | ||||||
|  |     amx/mmq.cpp | ||||||
|  |     amx/mmq.h | ||||||
|  |     ggml-cpu-impl.h | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  | target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) | ||||||
| target_include_directories(ggml-cpu PRIVATE .) | target_include_directories(ggml-cpu PRIVATE .) | ||||||
| 
 | 
 | ||||||
| if (APPLE AND GGML_ACCELERATE) | if (APPLE AND GGML_ACCELERATE) | ||||||
|  | @ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE) | ||||||
|     if (ACCELERATE_FRAMEWORK) |     if (ACCELERATE_FRAMEWORK) | ||||||
|         message(STATUS "Accelerate framework found") |         message(STATUS "Accelerate framework found") | ||||||
| 
 | 
 | ||||||
|         add_compile_definitions(GGML_USE_ACCELERATE) |         target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) | ||||||
|         add_compile_definitions(ACCELERATE_NEW_LAPACK) |         target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) | ||||||
|         add_compile_definitions(ACCELERATE_LAPACK_ILP64) |         target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) | ||||||
| 
 | 
 | ||||||
|         target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) |         target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) | ||||||
|     else() |     else() | ||||||
|  | @ -29,15 +37,9 @@ if (GGML_OPENMP) | ||||||
|     if (OpenMP_FOUND) |     if (OpenMP_FOUND) | ||||||
|         message(STATUS "OpenMP found") |         message(STATUS "OpenMP found") | ||||||
| 
 | 
 | ||||||
|         add_compile_definitions(GGML_USE_OPENMP) |         target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) | ||||||
| 
 | 
 | ||||||
|         target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) |         target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) | ||||||
| 
 |  | ||||||
|         # FIXME: should be replaced with a compiler id check |  | ||||||
|         #if (GGML_MUSA) |  | ||||||
|         #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include") |  | ||||||
|         #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so") |  | ||||||
|         #endif() |  | ||||||
|     else() |     else() | ||||||
|         message(WARNING "OpenMP not found") |         message(WARNING "OpenMP not found") | ||||||
|     endif() |     endif() | ||||||
|  | @ -46,9 +48,9 @@ endif() | ||||||
| if (GGML_LLAMAFILE) | if (GGML_LLAMAFILE) | ||||||
|     message(STATUS "Using llamafile") |     message(STATUS "Using llamafile") | ||||||
| 
 | 
 | ||||||
|     add_compile_definitions(GGML_USE_LLAMAFILE) |     target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) | ||||||
| 
 | 
 | ||||||
|     target_sources(ggml-cpu PRIVATE |     list(APPEND GGML_CPU_SOURCES | ||||||
|                 llamafile/sgemm.cpp |                 llamafile/sgemm.cpp | ||||||
|                 llamafile/sgemm.h) |                 llamafile/sgemm.h) | ||||||
| endif() | endif() | ||||||
|  | @ -58,7 +60,7 @@ if (GGML_CPU_HBM) | ||||||
| 
 | 
 | ||||||
|     message(STATUS "Using memkind for CPU HBM") |     message(STATUS "Using memkind for CPU HBM") | ||||||
| 
 | 
 | ||||||
|     add_compile_definitions(GGML_USE_CPU_HBM) |     target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) | ||||||
| 
 | 
 | ||||||
|     target_link_libraries(ggml-cpu PUBLIC memkind) |     target_link_libraries(ggml-cpu PUBLIC memkind) | ||||||
| endif() | endif() | ||||||
|  | @ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR | ||||||
|     message(STATUS "ARM detected") |     message(STATUS "ARM detected") | ||||||
| 
 | 
 | ||||||
|     if (MSVC) |     if (MSVC) | ||||||
|         add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead |         list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead | ||||||
|         add_compile_definitions(__ARM_NEON) |         list(APPEND ARCH_DEFINITIONS __ARM_NEON) | ||||||
|         add_compile_definitions(__ARM_FEATURE_FMA) |         list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) | ||||||
| 
 | 
 | ||||||
|         set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) |         set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) | ||||||
|         string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") |         string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") | ||||||
| 
 | 
 | ||||||
|         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) |         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) | ||||||
|         if (GGML_COMPILER_SUPPORT_DOTPROD) |         if (GGML_COMPILER_SUPPORT_DOTPROD) | ||||||
|             add_compile_definitions(__ARM_FEATURE_DOTPROD) |             list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) | ||||||
| 
 | 
 | ||||||
|             message(STATUS "ARM feature DOTPROD enabled") |             message(STATUS "ARM feature DOTPROD enabled") | ||||||
|         endif () |         endif () | ||||||
|  | @ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR | ||||||
|         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) |         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) | ||||||
| 
 | 
 | ||||||
|         if (GGML_COMPILER_SUPPORT_MATMUL_INT8) |         if (GGML_COMPILER_SUPPORT_MATMUL_INT8) | ||||||
|             add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) |             list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) | ||||||
| 
 | 
 | ||||||
|             message(STATUS "ARM feature MATMUL_INT8 enabled") |             message(STATUS "ARM feature MATMUL_INT8 enabled") | ||||||
|         endif () |         endif () | ||||||
| 
 | 
 | ||||||
|         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) |         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) | ||||||
|         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) |         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) | ||||||
|             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |             list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | ||||||
| 
 | 
 | ||||||
|             message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") |             message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") | ||||||
|         endif () |         endif () | ||||||
|  | @ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR | ||||||
|                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) |                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) | ||||||
|                 if (GGML_COMPILER_SUPPORT_DOTPROD) |                 if (GGML_COMPILER_SUPPORT_DOTPROD) | ||||||
|                     set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") |                     set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") | ||||||
|                     add_compile_definitions(__ARM_FEATURE_DOTPROD) |                     list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) | ||||||
| 
 | 
 | ||||||
|                     message(STATUS "ARM feature DOTPROD enabled") |                     message(STATUS "ARM feature DOTPROD enabled") | ||||||
|                 endif () |                 endif () | ||||||
|  | @ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR | ||||||
|                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) |                 check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) | ||||||
|                 if (GGML_COMPILER_SUPPORT_MATMUL_INT8) |                 if (GGML_COMPILER_SUPPORT_MATMUL_INT8) | ||||||
|                     set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") |                     set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") | ||||||
|                     add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) |                     list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) | ||||||
| 
 | 
 | ||||||
|                     message(STATUS "ARM feature MATMUL_INT8 enabled") |                     message(STATUS "ARM feature MATMUL_INT8 enabled") | ||||||
|                 endif () |                 endif () | ||||||
|  | @ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW | ||||||
|     if (MSVC) |     if (MSVC) | ||||||
|         # instruction set detection for MSVC only |         # instruction set detection for MSVC only | ||||||
|         if (GGML_NATIVE) |         if (GGML_NATIVE) | ||||||
|             # TODO: improve, should not reference files from the parent folder |  | ||||||
|             include(cmake/FindSIMD.cmake) |             include(cmake/FindSIMD.cmake) | ||||||
|         endif () |         endif () | ||||||
|         if (GGML_AVX512) |         if (GGML_AVX512) | ||||||
|  | @ -185,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW | ||||||
|             # macros corresponding to the extensions. |             # macros corresponding to the extensions. | ||||||
|             # Do it manually. |             # Do it manually. | ||||||
|             if (GGML_AVX512_VBMI) |             if (GGML_AVX512_VBMI) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) |                 list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>) |  | ||||||
|                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") |                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||||||
|                     list(APPEND ARCH_FLAGS -mavx512vbmi) |                     list(APPEND ARCH_FLAGS -mavx512vbmi) | ||||||
|                 endif() |                 endif() | ||||||
|             endif() |             endif() | ||||||
|             if (GGML_AVX512_VNNI) |             if (GGML_AVX512_VNNI) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) |                 list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) |  | ||||||
|                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") |                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||||||
|                     list(APPEND ARCH_FLAGS -mavx512vnni) |                     list(APPEND ARCH_FLAGS -mavx512vnni) | ||||||
|                 endif() |                 endif() | ||||||
|             endif() |             endif() | ||||||
|             if (GGML_AVX512_BF16) |             if (GGML_AVX512_BF16) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) |                 list(APPEND ARCH_DEFINITIONS __AVX512BF16__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>) |  | ||||||
|                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") |                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||||||
|                     list(APPEND ARCH_FLAGS -mavx512bf16) |                     list(APPEND ARCH_FLAGS -mavx512bf16) | ||||||
|                 endif() |                 endif() | ||||||
|             endif() |             endif() | ||||||
|             if (GGML_AMX_TILE) |             if (GGML_AMX_TILE) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) |                 list(APPEND ARCH_DEFINITIONS __AMX_TILE__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>) |  | ||||||
|             endif() |             endif() | ||||||
|             if (GGML_AMX_INT8) |             if (GGML_AMX_INT8) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) |                 list(APPEND ARCH_DEFINITIONS __AMX_INT8__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>) |  | ||||||
|             endif() |             endif() | ||||||
|             if (GGML_AMX_BF16) |             if (GGML_AMX_BF16) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) |                 list(APPEND ARCH_DEFINITIONS __AMX_BF16__) | ||||||
|                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>) |  | ||||||
|             endif() |             endif() | ||||||
|         elseif (GGML_AVX2) |         elseif (GGML_AVX2) | ||||||
|             list(APPEND ARCH_FLAGS /arch:AVX2) |             list(APPEND ARCH_FLAGS /arch:AVX2) | ||||||
|  | @ -299,11 +294,12 @@ endif() | ||||||
| 
 | 
 | ||||||
| if (GGML_CPU_AARCH64) | if (GGML_CPU_AARCH64) | ||||||
|     message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") |     message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") | ||||||
|     add_compile_definitions(GGML_USE_CPU_AARCH64) |     target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>") | target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) | ||||||
| target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>") | set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}") | ||||||
|  | set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") | ||||||
| 
 | 
 | ||||||
| if (EMSCRIPTEN) | if (EMSCRIPTEN) | ||||||
|     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") |     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") | ||||||
|  |  | ||||||
							
								
								
									
										196
									
								
								ggml/src/ggml-cpu/amx/amx.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										196
									
								
								ggml/src/ggml-cpu/amx/amx.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,196 @@ | ||||||
|  | #include "amx.h" | ||||||
|  | #include "common.h" | ||||||
|  | #include "mmq.h" | ||||||
|  | #include "ggml-backend-impl.h" | ||||||
|  | #include "ggml-backend.h" | ||||||
|  | #include "ggml-impl.h" | ||||||
|  | #include "ggml-cpu.h" | ||||||
|  | 
 | ||||||
|  | #if defined(__gnu_linux__) | ||||||
|  | #include <sys/syscall.h> | ||||||
|  | #include <unistd.h> | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #include <cstdlib> | ||||||
|  | #include <cstring> | ||||||
|  | #include <memory> | ||||||
|  | 
 | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  | 
 | ||||||
|  | // AMX buffer interface
 | ||||||
|  | static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { | ||||||
|  |     free(buffer->context); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { | ||||||
|  |     return (void *)(buffer->context); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { | ||||||
|  |     memset((char *)tensor->data + offset, value, size); | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buffer); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { | ||||||
|  |     if (qtype_has_amx_kernels(tensor->type)) { | ||||||
|  |         ggml_backend_amx_convert_weight(tensor, data, offset, size); | ||||||
|  |     } else { | ||||||
|  |         memcpy((char *)tensor->data + offset, data, size); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buffer); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { | ||||||
|  |     GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); | ||||||
|  |     memcpy(data, (const char *)tensor->data + offset, size); | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buffer); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { | ||||||
|  |     if (ggml_backend_buffer_is_host(src->buffer)) { | ||||||
|  |         if (qtype_has_amx_kernels(src->type)) { | ||||||
|  |             ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst)); | ||||||
|  |         } else { | ||||||
|  |             memcpy(dst->data, src->data, ggml_nbytes(src)); | ||||||
|  |         } | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  |     return false; | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buffer); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { | ||||||
|  |     memset(buffer->context, value, buffer->size); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { | ||||||
|  |     /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer, | ||||||
|  |     /* .get_base        = */ ggml_backend_amx_buffer_get_base, | ||||||
|  |     /* .init_tensor     = */ NULL, // no initialization required
 | ||||||
|  |     /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor, | ||||||
|  |     /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor, | ||||||
|  |     /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor, | ||||||
|  |     /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor, | ||||||
|  |     /* .clear           = */ ggml_backend_amx_buffer_clear, | ||||||
|  |     /* .reset           = */ NULL, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { | ||||||
|  |     return "AMX"; | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buft); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { | ||||||
|  |     void * data = aligned_alloc(TENSOR_ALIGNMENT, size); | ||||||
|  |     if (data == NULL) { | ||||||
|  |         fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); | ||||||
|  |         return NULL; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { | ||||||
|  |     return TENSOR_ALIGNMENT; | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buft); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { | ||||||
|  |     return ggml_backend_amx_get_alloc_size(tensor); | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buft); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { | ||||||
|  |     return false; | ||||||
|  | 
 | ||||||
|  |     GGML_UNUSED(buft); | ||||||
|  | } | ||||||
|  | 
 | ||||||
// arch_prctl sub-commands and XSTATE feature numbers used below to request a
// CPU state component from the kernel.
// NOTE(review): values presumably mirror the Linux x86 xstate ABI — confirm against <asm/prctl.h>.
#define ARCH_GET_XCOMP_PERM     0x1022
#define ARCH_REQ_XCOMP_PERM     0x1023
#define XFEATURE_XTILECFG       17
#define XFEATURE_XTILEDATA      18

// Prepares the process for using AMX.
//
// On GNU/Linux this asks the kernel for permission to use the XTILEDATA state
// component and returns false (after logging to stderr) if the request is
// refused. On Windows no request is made and true is returned.
// On any other platform false is returned, so the function never falls off
// the end without a return value.
static bool ggml_amx_init() {
// SYS_arch_prctl only exists on x86 Linux; without it, fall through to the other branches
#if defined(__gnu_linux__) && defined(SYS_arch_prctl)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#else
    // no known way to enable AMX here: report it as unavailable
    return false;
#endif
}
|  | ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { | ||||||
|  |     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { | ||||||
|  |         /* .iface = */ { | ||||||
|  |             /* .get_name         = */ ggml_backend_amx_buffer_type_get_name, | ||||||
|  |             /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer, | ||||||
|  |             /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment, | ||||||
|  |             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
 | ||||||
|  |             /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size, | ||||||
|  |             /* .is_host          = */ ggml_backend_amx_buffer_type_is_host, | ||||||
|  |         }, | ||||||
|  |         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), | ||||||
|  |         /* .context = */ NULL, | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     if (!ggml_amx_init()) { | ||||||
|  |         return NULL; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return &ggml_backend_buffer_type_amx; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { | ||||||
|  |     return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) { | ||||||
|  |     // handle only 2d gemm for now
 | ||||||
|  |     auto is_contiguous_2d = [](const struct ggml_tensor * t) { | ||||||
|  |         return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     switch (op->op) { | ||||||
|  |         case GGML_OP_NONE: | ||||||
|  |         case GGML_OP_RESHAPE: | ||||||
|  |         case GGML_OP_VIEW: | ||||||
|  |         case GGML_OP_PERMUTE: | ||||||
|  |         case GGML_OP_TRANSPOSE: | ||||||
|  |             return true; | ||||||
|  | 
 | ||||||
|  |         case GGML_OP_MUL_MAT: { | ||||||
|  |             const struct ggml_tensor * src0 = op->src[0]; | ||||||
|  |             const struct ggml_tensor * src1 = op->src[1]; | ||||||
|  | 
 | ||||||
|  |             const enum ggml_type type = src0->type; | ||||||
|  |             const int64_t ne0 = op->ne[0]; | ||||||
|  | 
 | ||||||
|  |             // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
 | ||||||
|  |             // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
 | ||||||
|  |             bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); | ||||||
|  | 
 | ||||||
|  |             bool can_use_amx = | ||||||
|  |                 is_contiguous_2d(src0) &&       // src0 must be contiguous
 | ||||||
|  |                 is_contiguous_2d(src1) &&       // src1 must be contiguous
 | ||||||
|  |                 src1->type == GGML_TYPE_F32 &&  // src1 must be float32
 | ||||||
|  |                 has_amx_kernels &&              // with amx kernel impls
 | ||||||
|  |                 ne0 % (TILE_N * 2) == 0;        // out_features is 32x
 | ||||||
|  | 
 | ||||||
|  |             return can_use_amx; | ||||||
|  |         } | ||||||
|  |         default: | ||||||
|  |             return false; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 | ||||||
							
								
								
									
										20
									
								
								ggml/src/ggml-cpu/amx/amx.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								ggml/src/ggml-cpu/amx/amx.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | ||||||
|  | #include "ggml-backend.h" | ||||||
|  | #include "ggml-cpu-impl.h" | ||||||
|  | 
 | ||||||
|  | #ifdef __cplusplus | ||||||
|  | extern "C" { | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  | 
 | ||||||
|  | ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); | ||||||
|  | bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft); | ||||||
|  | bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op); | ||||||
|  | void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); | ||||||
|  | size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #ifdef __cplusplus | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | @ -1,8 +1,7 @@ | ||||||
| #pragma once | #pragma once | ||||||
| 
 | 
 | ||||||
| #include "ggml.h" | #include "ggml.h" | ||||||
| // hack until AMX is moved into the CPU backend
 | #include "ggml-cpu-impl.h" | ||||||
| #include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h> |  | ||||||
| 
 | 
 | ||||||
| #include <algorithm> | #include <algorithm> | ||||||
| #include <memory> | #include <memory> | ||||||
|  | @ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) { | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | template <typename func_t> | ||||||
|  | inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) { | ||||||
|  |     int tbegin, tend; | ||||||
|  |     balance211(n, params->nth, params->ith, tbegin, tend); | ||||||
|  |     f(tbegin, tend); | ||||||
|  |     ggml_barrier(params->threadpool); // TODO: might not always be needed
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // quantized types that have AMX support
 | // quantized types that have AMX support
 | ||||||
| inline bool qtype_has_amx_kernels(const enum ggml_type type) { | inline bool qtype_has_amx_kernels(const enum ggml_type type) { | ||||||
|     // TODO: fix padding for vnni format
 |     // TODO: fix padding for vnni format
 | ||||||
|     return (type == GGML_TYPE_Q4_0) || |     return (type == GGML_TYPE_Q4_0) || | ||||||
|         (type == GGML_TYPE_Q4_1); |         (type == GGML_TYPE_Q4_1) || | ||||||
|         //(type == GGML_TYPE_Q8_0) ||
 |         (type == GGML_TYPE_Q8_0) || | ||||||
|         //(type == GGML_TYPE_Q4_K) ||
 |         (type == GGML_TYPE_Q4_K) || | ||||||
|         //(type == GGML_TYPE_Q5_K) ||
 |         (type == GGML_TYPE_Q5_K) || | ||||||
|         //(type == GGML_TYPE_Q6_K) ||
 |         (type == GGML_TYPE_Q6_K) || | ||||||
|         //(type == GGML_TYPE_IQ4_XS);
 |         (type == GGML_TYPE_IQ4_XS); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // ggml backend context
 | // ggml backend context
 | ||||||
|  | @ -4,8 +4,11 @@ | ||||||
| #pragma GCC diagnostic ignored "-Wunused-local-typedefs" | #pragma GCC diagnostic ignored "-Wunused-local-typedefs" | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #include "amx.h" | ||||||
| #include "mmq.h" | #include "mmq.h" | ||||||
| #include "ggml-impl.h" | #include "ggml-impl.h" | ||||||
|  | #include "ggml-cpu-impl.h" | ||||||
|  | #include "ggml-cpu-quants.h" | ||||||
| #include "ggml-quants.h" | #include "ggml-quants.h" | ||||||
| #include <algorithm> | #include <algorithm> | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
|  | @ -33,7 +36,7 @@ | ||||||
| #define ALWAYS_INLINE inline | #define ALWAYS_INLINE inline | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #if defined(__AMX_INT8__) | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
| 
 | 
 | ||||||
| namespace { | namespace { | ||||||
| 
 | 
 | ||||||
|  | @ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k); | ||||||
| 
 | 
 | ||||||
| template <> | template <> | ||||||
| inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) { | inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) { | ||||||
|     // FIXME: using unoptimized reference impl until moved to CPU backend
 |     quantize_row_q8_0(x, (block_q8_0 *)vy, k); | ||||||
|     quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <> | template <> | ||||||
| inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) { | inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) { | ||||||
|     quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k); |     quantize_row_q8_1(x, (block_q8_1 *)vy, k); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| template <> | template <> | ||||||
|  | @ -950,7 +952,7 @@ template<typename TB, typename packed_B_t = packed_B_type<TB>> | ||||||
| void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { | void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { | ||||||
|     GGML_UNUSED(tile); |     GGML_UNUSED(tile); | ||||||
|     GGML_UNUSED(packed_B); |     GGML_UNUSED(packed_B); | ||||||
| }; | } | ||||||
| 
 | 
 | ||||||
| template <> | template <> | ||||||
| void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { | void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { | ||||||
|  | @ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) { | ||||||
| 
 | 
 | ||||||
| // pack weight to vnni format
 | // pack weight to vnni format
 | ||||||
| void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { | void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { | ||||||
| 
 |     GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
 | ||||||
|     size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor); |  | ||||||
|     GGML_ASSERT(alloc_size == size); |  | ||||||
| 
 | 
 | ||||||
|     const enum ggml_type TYPE = tensor->type; |     const enum ggml_type TYPE = tensor->type; | ||||||
| 
 | 
 | ||||||
|  | @ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { | ||||||
|  |     struct ggml_tensor * src0 = dst->src[0]; | ||||||
|  | 
 | ||||||
|  |     const enum ggml_type TYPE = src0->type; | ||||||
|  | 
 | ||||||
|  |     const bool is_floating_type = TYPE == GGML_TYPE_F16; | ||||||
|  |     if (is_floating_type) { | ||||||
|  |         return 0; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const int M = dst->ne[1]; | ||||||
|  |     const int K = src0->ne[0]; | ||||||
|  | 
 | ||||||
|  |     size_t desired_wsize = 0; | ||||||
|  | 
 | ||||||
|  |     GGML_DISPATCH_QTYPES(TYPE, [&] { | ||||||
|  |         const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); | ||||||
|  |         desired_wsize = M * row_size_A; | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|  |     return desired_wsize; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
 | // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
 | ||||||
| //
 | //
 | ||||||
| // src0: weight in shape of {N, K}, quantized
 | // src0: weight in shape of {N, K}, quantized
 | ||||||
|  | @ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d | ||||||
| //
 | //
 | ||||||
| // the function performs: dst = src1 @ src0.T
 | // the function performs: dst = src1 @ src0.T
 | ||||||
| //
 | //
 | ||||||
| void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { | void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) { | ||||||
|     struct ggml_tensor * src0 = dst->src[0]; |     struct ggml_tensor * src0 = dst->src[0]; | ||||||
|     struct ggml_tensor * src1 = dst->src[1]; |     struct ggml_tensor * src1 = dst->src[1]; | ||||||
| 
 | 
 | ||||||
|     const enum ggml_type TYPE = src0->type; |     const enum ggml_type TYPE = src0->type; | ||||||
| 
 | 
 | ||||||
|     const int n_threads = ctx->n_threads; |  | ||||||
| 
 |  | ||||||
|     // f16 only has avx512 kernels for now,
 |     // f16 only has avx512 kernels for now,
 | ||||||
|     // amx kernels will be added once 6th gen xeon is released.
 |     // amx kernels will be added once 6th gen xeon is released.
 | ||||||
|     const bool is_floating_type = TYPE == GGML_TYPE_F16; |     const bool is_floating_type = TYPE == GGML_TYPE_F16; | ||||||
|  | @ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|         const int MB = div_up(M, BLOCK_M); |         const int MB = div_up(M, BLOCK_M); | ||||||
|         const int NB = div_up(N, BLOCK_N); |         const int NB = div_up(N, BLOCK_N); | ||||||
| 
 | 
 | ||||||
|         parallel_for(n_threads, MB * NB, [&](int begin, int end) { |         parallel_for_ggml(params, MB * NB, [&](int begin, int end) { | ||||||
|             GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { |             GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { | ||||||
|                 for (int i = begin; i < end; ++i) { |                 for (int i = begin; i < end; ++i) { | ||||||
|                     int mb = i / NB; |                     int mb = i / NB; | ||||||
|  | @ -2412,17 +2433,16 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // pointer to work space, used convert A from float to quantized type
 |     // pointer to work space, used convert A from float to quantized type
 | ||||||
|     void * wdata = nullptr; |     void * wdata = params->wdata; | ||||||
| 
 | 
 | ||||||
|     //TODO: performance improvement: merge quant A
 |     //TODO: performance improvement: merge quant A
 | ||||||
|  |     if (params->ith == 0) { | ||||||
|         GGML_DISPATCH_QTYPES(TYPE, [&] { |         GGML_DISPATCH_QTYPES(TYPE, [&] { | ||||||
|             const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); |             const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); | ||||||
|             const size_t desired_wsize = M * row_size_A; |             const size_t desired_wsize = M * row_size_A; | ||||||
|         if (ctx->work_size < desired_wsize) { |             if (params->wsize < desired_wsize) { | ||||||
|             ctx->work_data.reset(new char[desired_wsize]); |                 GGML_ABORT("insufficient work space size"); | ||||||
|             ctx->work_size = desired_wsize; |  | ||||||
|             } |             } | ||||||
|         wdata = ctx->work_data.get(); |  | ||||||
| 
 | 
 | ||||||
|             // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
 |             // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
 | ||||||
|             // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
 |             // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
 | ||||||
|  | @ -2433,6 +2453,9 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|                 from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K); |                 from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K); | ||||||
|             } |             } | ||||||
|         }); |         }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     ggml_barrier(params->threadpool); | ||||||
| 
 | 
 | ||||||
|     if (M == 1) { |     if (M == 1) { | ||||||
|         // MB = 1 and handle 8 tiles in each block
 |         // MB = 1 and handle 8 tiles in each block
 | ||||||
|  | @ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|         constexpr int BLOCK_N = TILE_N * kTilesN; |         constexpr int BLOCK_N = TILE_N * kTilesN; | ||||||
|         const int NB = div_up(N, BLOCK_N); |         const int NB = div_up(N, BLOCK_N); | ||||||
| 
 | 
 | ||||||
|         parallel_for(n_threads, NB, [&](int begin, int end) { |         parallel_for_ggml(params, NB, [&](int begin, int end) { | ||||||
|             GGML_DISPATCH_QTYPES(TYPE, [&] { |             GGML_DISPATCH_QTYPES(TYPE, [&] { | ||||||
|                 const int KB = K / blck_size; |                 const int KB = K / blck_size; | ||||||
|                 const int TILE_SIZE = get_tile_size<type>(); |                 const int TILE_SIZE = get_tile_size<type>(); | ||||||
|  | @ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|     const int MB = div_up(M, BLOCK_M); |     const int MB = div_up(M, BLOCK_M); | ||||||
|     const int NB = div_up(N, BLOCK_N); |     const int NB = div_up(N, BLOCK_N); | ||||||
| 
 | 
 | ||||||
|     parallel_for(n_threads, MB * NB, [&](int begin, int end) { |     parallel_for_ggml(params, MB * NB, [&](int begin, int end) { | ||||||
|         // init tile config for each thread
 |         // init tile config for each thread
 | ||||||
|         ggml_tile_config_init(); |         ggml_tile_config_init(); | ||||||
| 
 | 
 | ||||||
|  | @ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #else // if defined(__AMX_INT8__)
 | #endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 | ||||||
| 
 |  | ||||||
| void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { |  | ||||||
|     fprintf(stderr, "GGML is not compiled with AMX support!\n"); |  | ||||||
| 
 |  | ||||||
|     GGML_UNUSED(ctx); |  | ||||||
|     GGML_UNUSED(dst); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #endif // if defined(__AMX_INT8__)
 |  | ||||||
|  | @ -1,6 +1,5 @@ | ||||||
| #pragma once | #pragma once | ||||||
| #include "common.h" | #include "common.h" | ||||||
| #include <stdint.h> |  | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| extern "C" { | extern "C" { | ||||||
|  | @ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); | ||||||
| 
 | 
 | ||||||
| void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); | void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); | ||||||
| 
 | 
 | ||||||
| void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); | void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
|  | @ -15,6 +15,18 @@ | ||||||
| extern "C" { | extern "C" { | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | struct ggml_compute_params { | ||||||
|  |     // ith = thread index, nth = number of threads
 | ||||||
|  |     int ith, nth; | ||||||
|  | 
 | ||||||
|  |     // work buffer for all threads
 | ||||||
|  |     size_t wsize; | ||||||
|  |     void * wdata; | ||||||
|  | 
 | ||||||
|  |     struct ggml_threadpool * threadpool; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| #if defined(_MSC_VER) | #if defined(_MSC_VER) | ||||||
| 
 | 
 | ||||||
| #define m512bh(p) p | #define m512bh(p) p | ||||||
|  | @ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) { | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | // TODO: move to ggml-threading
 | ||||||
|  | void ggml_barrier(struct ggml_threadpool * tp); | ||||||
|  | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -10,6 +10,7 @@ | ||||||
| #include "ggml-quants.h" | #include "ggml-quants.h" | ||||||
| #include "ggml-cpu-quants.h" | #include "ggml-cpu-quants.h" | ||||||
| #include "ggml-threading.h" | #include "ggml-threading.h" | ||||||
|  | #include "amx/amx.h" | ||||||
| #include "ggml.h" | #include "ggml.h" | ||||||
| 
 | 
 | ||||||
| #if defined(_MSC_VER) || defined(__MINGW32__) | #if defined(_MSC_VER) || defined(__MINGW32__) | ||||||
|  | @ -624,7 +625,7 @@ do {                                                                  \ | ||||||
|     for (int i = 0; i < offset; ++i) {                                \ |     for (int i = 0; i < offset; ++i) {                                \ | ||||||
|         x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \ |         x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \ | ||||||
|     }                                                                 \ |     }                                                                 \ | ||||||
|     res = _mm512_reduce_add_ps(x[0]);                                 \ |     res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \ | ||||||
| } while (0) | } while (0) | ||||||
| 
 | 
 | ||||||
| // TODO: is this optimal ?
 | // TODO: is this optimal ?
 | ||||||
|  | @ -674,7 +675,7 @@ do {                                                              \ | ||||||
|     for (int i = 0; i < offset; ++i) {                            \ |     for (int i = 0; i < offset; ++i) {                            \ | ||||||
|         x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \ |         x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \ | ||||||
|     }                                                             \ |     }                                                             \ | ||||||
|     res = _mm512_reduce_add_ps(x[0]);                             \ |     res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \ | ||||||
| } while (0) | } while (0) | ||||||
| 
 | 
 | ||||||
| #define GGML_F16_VEC                GGML_F32Cx16 | #define GGML_F16_VEC                GGML_F32Cx16 | ||||||
|  | @ -685,8 +686,8 @@ do {                                                              \ | ||||||
| #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA | #define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA | ||||||
| #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD | #define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD | ||||||
| #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL | #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL | ||||||
| #define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE |  | ||||||
| 
 | 
 | ||||||
|  | #define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE | ||||||
| #elif defined(__AVX__) | #elif defined(__AVX__) | ||||||
| 
 | 
 | ||||||
| #define GGML_SIMD | #define GGML_SIMD | ||||||
|  | @ -1367,31 +1368,15 @@ struct ggml_compute_state { | ||||||
|     int ith; |     int ith; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct ggml_compute_params { |  | ||||||
|     // ith = thread index, nth = number of threads
 |  | ||||||
|     int ith, nth; |  | ||||||
| 
 |  | ||||||
|     // work buffer for all threads
 |  | ||||||
|     size_t wsize; |  | ||||||
|     void * wdata; |  | ||||||
| 
 |  | ||||||
|     struct ggml_threadpool * threadpool; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| //
 | //
 | ||||||
| // fundamental operations
 | // fundamental operations
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | ||||||
| 
 |  | ||||||
| inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | ||||||
| 
 |  | ||||||
| inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | ||||||
| 
 |  | ||||||
| inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | ||||||
| 
 |  | ||||||
| inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | ||||||
| 
 |  | ||||||
| inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; } | inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; } | ||||||
| inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    } | inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    } | ||||||
| inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        } | inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        } | ||||||
|  | @ -2286,7 +2271,7 @@ struct ggml_state { | ||||||
| 
 | 
 | ||||||
| static struct ggml_state g_state = {0}; | static struct ggml_state g_state = {0}; | ||||||
| 
 | 
 | ||||||
| static void ggml_barrier(struct ggml_threadpool * tp) { | void ggml_barrier(struct ggml_threadpool * tp) { | ||||||
|     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); |     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); | ||||||
|     if (n_threads == 1) { |     if (n_threads == 1) { | ||||||
|         return; |         return; | ||||||
|  | @ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat( | ||||||
|         type = (enum ggml_type)(intptr_t)src0->extra; |         type = (enum ggml_type)(intptr_t)src0->extra; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  |     if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { | ||||||
|  |         ggml_backend_amx_mul_mat(params, dst); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|     enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type; |     enum ggml_type           const vec_dot_type         = type_traits_cpu[type].vec_dot_type; | ||||||
|     ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float; |     ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float; | ||||||
|     ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat; |     ggml_from_float_to_mat_t const from_float_to_mat    = type_traits_cpu[vec_dot_type].from_float_to_mat; | ||||||
|  | @ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan( | ||||||
|                 } break; |                 } break; | ||||||
|             case GGML_OP_MUL_MAT: |             case GGML_OP_MUL_MAT: | ||||||
|                 { |                 { | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  |                     if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) { | ||||||
|  |                         cur = ggml_backend_amx_desired_wsize(node); | ||||||
|  |                     } | ||||||
|  | #endif | ||||||
|                     const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; |                     const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; | ||||||
| 
 | 
 | ||||||
|                     if (node->src[1]->type != vec_dot_type) { |                     if (node->src[1]->type != vec_dot_type) { | ||||||
|                         cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); |                         size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); | ||||||
|  |                         cur = MAX(cur, cur2); | ||||||
|                     } |                     } | ||||||
|                 } break; |                 } break; | ||||||
|             case GGML_OP_MUL_MAT_ID: |             case GGML_OP_MUL_MAT_ID: | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ | ||||||
| #include "ggml-cpu.h" | #include "ggml-cpu.h" | ||||||
| #include "ggml-cpu-aarch64.h" | #include "ggml-cpu-aarch64.h" | ||||||
| #include "ggml-impl.h" | #include "ggml-impl.h" | ||||||
|  | #include "amx/amx.h" | ||||||
| #include <cctype> | #include <cctype> | ||||||
| #include <string> | #include <string> | ||||||
| #include <vector> | #include <vector> | ||||||
|  | @ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen | ||||||
|     static std::vector<ggml_backend_buffer_type_t> bufts = []() { |     static std::vector<ggml_backend_buffer_type_t> bufts = []() { | ||||||
|         std::vector<ggml_backend_buffer_type_t> bufts; |         std::vector<ggml_backend_buffer_type_t> bufts; | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_CPU_HBM | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|         bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); |         if (ggml_backend_amx_buffer_type()) { | ||||||
|  |             bufts.push_back(ggml_backend_amx_buffer_type()); | ||||||
|  |         } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_CPU_AARCH64 | #ifdef GGML_USE_CPU_AARCH64 | ||||||
|  |         if (ggml_backend_cpu_aarch64_buffer_type()) { | ||||||
|             bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); |             bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); | ||||||
|  |         } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|         bufts.push_back(NULL); |         bufts.push_back(NULL); | ||||||
|  | @ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st | ||||||
|     const struct ggml_tensor * src0 = op->src[0]; |     const struct ggml_tensor * src0 = op->src[0]; | ||||||
|     const struct ggml_tensor * src1 = op->src[1]; |     const struct ggml_tensor * src1 = op->src[1]; | ||||||
| 
 | 
 | ||||||
|  |     if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) { | ||||||
|  |         return true; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { |     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { | ||||||
|         if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { |         if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { | ||||||
|             return false; |             return false; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  |     if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { | ||||||
|  |         return ggml_backend_amx_device_supports_op(op); | ||||||
|  |     } | ||||||
|  |     for (int i = 1; i < GGML_MAX_SRC; i++) { | ||||||
|  |         if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) { | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|     for (int i = 1; i < GGML_MAX_SRC; i++) { |     for (int i = 1; i < GGML_MAX_SRC; i++) { | ||||||
|         if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { |         if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { | ||||||
|             return false; |             return false; | ||||||
|  | @ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { | static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { | ||||||
|     return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); |     bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); | ||||||
|  | 
 | ||||||
|  | #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) | ||||||
|  |     supported = supported || ggml_backend_amx_buft_is_amx(buft); | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  |     return supported; | ||||||
| 
 | 
 | ||||||
|     GGML_UNUSED(dev); |     GGML_UNUSED(dev); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -50,8 +50,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "sgemm.h" | #include "sgemm.h" | ||||||
| #include "ggml-impl.h" | #include "ggml-impl.h" | ||||||
| // hack until moved into the CPU backend
 | #include "ggml-cpu-impl.h" | ||||||
| #include "../ggml-cpu-impl.h" |  | ||||||
| #include "ggml-quants.h" | #include "ggml-quants.h" | ||||||
| 
 | 
 | ||||||
| #ifdef _MSC_VER | #ifdef _MSC_VER | ||||||
|  |  | ||||||
|  | @ -30,11 +30,13 @@ | ||||||
| extern "C" { | extern "C" { | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #undef MIN | #ifndef MIN | ||||||
| #undef MAX |  | ||||||
| 
 |  | ||||||
| #    define MIN(a, b) ((a) < (b) ? (a) : (b)) | #    define MIN(a, b) ((a) < (b) ? (a) : (b)) | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #ifndef MAX | ||||||
| #    define MAX(a, b) ((a) > (b) ? (a) : (b)) | #    define MAX(a, b) ((a) > (b) ? (a) : (b)) | ||||||
|  | #endif | ||||||
| 
 | 
 | ||||||
| // required for mmap as gguf only guarantees 32-byte alignment
 | // required for mmap as gguf only guarantees 32-byte alignment
 | ||||||
| #define TENSOR_ALIGNMENT 32 | #define TENSOR_ALIGNMENT 32 | ||||||
|  |  | ||||||
|  | @ -3,5 +3,5 @@ find_package (Threads REQUIRED) | ||||||
| set(TARGET vulkan-shaders-gen) | set(TARGET vulkan-shaders-gen) | ||||||
| add_executable(${TARGET} vulkan-shaders-gen.cpp) | add_executable(${TARGET} vulkan-shaders-gen.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) | target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) | ||||||
|  |  | ||||||
|  | @ -1,9 +1,9 @@ | ||||||
| set(TARGET llama-vdot) | set(TARGET llama-vdot) | ||||||
| add_executable(${TARGET} vdot.cpp) | add_executable(${TARGET} vdot.cpp) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-q8dot) | set(TARGET llama-q8dot) | ||||||
| add_executable(${TARGET} q8dot.cpp) | add_executable(${TARGET} q8dot.cpp) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_17) | ||||||
|  |  | ||||||
|  | @ -25,7 +25,7 @@ add_library(llama | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
| target_include_directories(llama PUBLIC . ../include) | target_include_directories(llama PUBLIC . ../include) | ||||||
| target_compile_features   (llama PUBLIC cxx_std_11) # don't bump | target_compile_features   (llama PUBLIC cxx_std_17) # don't bump | ||||||
| 
 | 
 | ||||||
| target_link_libraries(llama PUBLIC ggml) | target_link_libraries(llama PUBLIC ggml) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { | static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { | ||||||
|  | #if defined(__clang__) | ||||||
|  |     // disable C++17 deprecation warning for std::codecvt_utf8
 | ||||||
|  | #    pragma clang diagnostic push | ||||||
|  | #    pragma clang diagnostic ignored "-Wdeprecated-declarations" | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; |     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; | ||||||
|  | 
 | ||||||
|  | #if defined(__clang__) | ||||||
|  | #    pragma clang diagnostic pop | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|     return conv.from_bytes(s); |     return conv.from_bytes(s); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -284,7 +284,7 @@ static void test_perf() { | ||||||
| 
 | 
 | ||||||
|     data.reserve(n_vocab); |     data.reserve(n_vocab); | ||||||
|     for (int i = 0; i < n_vocab; i++) { |     for (int i = 0; i < n_vocab; i++) { | ||||||
|         const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f); |         const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5); | ||||||
|         data.emplace_back(llama_token_data{i, logit, 0.0f}); |         data.emplace_back(llama_token_data{i, logit, 0.0f}); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue