diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 685d19bd6..7110dda9e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -15,7 +15,7 @@ RUN \ scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ - scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ + scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ # Build llama-server cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ cmake --build build --target llama-server -j $(nproc) && \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d389dccb..f84fff9e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE) set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_AMX) - set(GGML_AMX ON) -endif() - if (NOT DEFINED GGML_CUDA_GRAPHS) set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 789fa3b0c..06d371e09 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX512 "ggml: enable AVX512" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 9c422cc4e..5df63884c 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_DEFINITIONS __AVXVNNI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavxvnni) + endif() + endif() else() if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) @@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (GGML_AVX2) list(APPEND ARCH_FLAGS -mavx2) endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + endif() if (GGML_AVX512) list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512dq) diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 0d889f714..514701ffe 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -281,8 +281,8 @@ static int ggml_backend_cpu_x86_score() { score += ggml_cpu_has_f16c () * 1<<1; score += ggml_cpu_has_ssse3 () * 1<<2; score += ggml_cpu_has_sse3 () * 1<<3; - score += ggml_cpu_has_avx () * 1<<5; score += ggml_cpu_has_avx_vnni () * 1<<4; + score += ggml_cpu_has_avx () * 1<<5; score += ggml_cpu_has_avx2 () * 1<<6; score += ggml_cpu_has_avx512 () * 1<<7; // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 14a1f00eb..f03ab77d7 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) { } static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { -#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) +#if defined(__AVX512VNNI__) const __m512i zero = _mm512_setzero_si512(); return _mm512_dpbusd_epi32(zero, ax, sy); #else