diff --git a/Makefile b/Makefile
index 87fe795aa..804c1bbec 100644
--- a/Makefile
+++ b/Makefile
@@ -359,6 +359,10 @@ ifdef LLAMA_SERVER_SSL
 	MK_LDFLAGS += -lssl -lcrypto
 endif
 
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -940,10 +944,6 @@ ggml/src/ggml-cuda/%.o: \
 	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
 endif # GGML_MUSA
 
-ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 30de6c99a..cef41a074 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -143,14 +143,23 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             if (GGML_AVX512_VBMI)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
             endif()
             if (GGML_AVX512_VNNI)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
             endif()
             if (GGML_AVX512_BF16)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
             endif()
             if (GGML_AMX_TILE)
                 add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4a97bfc32..5cdf59f25 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -49,6 +49,14 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 4ac6b5fc0..6125a080a 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-    "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
-    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+    "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+    "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
-    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
-    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
-    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
-    "use_mmap": "Use mmap", "embeddings": "Embeddings",
+    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+    "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+    "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
 }
 
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.