From 8f7080bf48828b538bc9387c3d150bbd4fb4cf2d Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 15 May 2024 23:41:03 +0200
Subject: [PATCH 01/20] readme : remove stray double quote (#7310)

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9f2f8df64..ecbe802df 100644
--- a/README.md
+++ b/README.md
@@ -532,7 +532,7 @@ Building the program with BLAS support may lead to some performance improvements
         cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
         && cmake --build build --config Release -- -j 16
     ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
     However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
   - Using `make` (example for target gfx1030, build with 16 CPU threads):

From 13ad16af1231ab2d245d35df3295bcfa23de1305 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <max.krasnyansky@gmail.com>
Date: Wed, 15 May 2024 19:47:36 -0700
Subject: [PATCH 02/20] Add support for properly optimized Windows ARM64 builds
 with LLVM and MSVC (#7191)

* logging: add proper checks for clang to avoid errors and warnings with VA_ARGS

* build: add CMake Presets and toolchian files for Windows ARM64

* matmul-int8: enable matmul-int8 with MSVC and fix Clang warnings

* ci: add support for optimized Windows ARM64 builds with MSVC and LLVM

* matmul-int8: fixed typos in q8_0_q8_0 matmuls

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* matmul-int8: remove unnecessary casts in q8_0_q8_0

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 .github/workflows/build.yml    | 61 ++++++++++++++++++----------------
 CMakeLists.txt                 |  5 +++
 CMakePresets.json              | 45 +++++++++++++++++++++++++
 cmake/arm64-windows-llvm.cmake | 16 +++++++++
 cmake/arm64-windows-msvc.cmake |  6 ++++
 common/log.h                   | 10 +++---
 ggml-quants.c                  | 53 +++++++++++++++--------------
 7 files changed, 138 insertions(+), 58 deletions(-)
 create mode 100644 CMakePresets.json
 create mode 100644 cmake/arm64-windows-llvm.cmake
 create mode 100644 cmake/arm64-windows-msvc.cmake

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7ac0e5f6e..2d2fea4a2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -693,26 +693,28 @@ jobs:
     strategy:
       matrix:
         include:
-          - build: 'rpc'
+          - build: 'rpc-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'noavx'
+          - build: 'noavx-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
+          - build: 'avx2-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx'
+          - build: 'avx-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
+          - build: 'avx512-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast'
+          - build: 'clblast-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas'
+          - build: 'openblas-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute'
+          - build: 'kompute-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan'
+          - build: 'vulkan-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'arm64'
-            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64'
+            defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'msvc-arm64'
+            defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
       - name: Clone
@@ -723,13 +725,13 @@ jobs:
 
       - name: Clone Kompute submodule
         id: clone_kompute
-        if: ${{ matrix.build == 'kompute' }}
+        if: ${{ matrix.build == 'kompute-x64' }}
         run: |
           git submodule update --init kompute
 
       - name: Download OpenCL SDK
         id: get_opencl
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
           mkdir $env:RUNNER_TEMP/opencl
@@ -737,7 +739,7 @@ jobs:
 
       - name: Download CLBlast
         id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@@ -750,7 +752,7 @@ jobs:
 
       - name: Download OpenBLAS
         id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
           curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -763,38 +765,41 @@ jobs:
 
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
       - name: Build
         id: cmake_build
         run: |
-          mkdir build
-          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake -S . -B build ${{ matrix.defines }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
       - name: Add clblast.dll
         id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
         run: |
           cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
           cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
 
       - name: Add libopenblas.dll
         id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
         run: |
           cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
           cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
 
       - name: Check AVX512F support
         id: check_avx512f
-        if: ${{ matrix.build == 'avx512' }}
+        if: ${{ matrix.build == 'avx512-x64' }}
         continue-on-error: true
         run: |
           cd build
@@ -808,14 +813,14 @@ jobs:
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
 
       - name: Test (Intel SDE)
         id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
         run: |
           curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
           # for some weird reason windows tar doesn't like sde tar.xz
@@ -843,14 +848,14 @@ jobs:
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
           Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-          name: llama-bin-win-${{ matrix.build }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip
 
   windows-latest-cmake-cuda:
     runs-on: windows-latest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index feb6f39d0..8ab6a45a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1007,6 +1007,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             add_compile_definitions(__ARM_FEATURE_DOTPROD)
         endif ()
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 000000000..ad1af7ecc
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,45 @@
+﻿{
+  "version": 4,
+  "configurePresets": [
+    {
+        "name":  "base",
+        "hidden": true,
+        "generator":   "Ninja",
+        "binaryDir":   "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
+
+    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+
+    {
+        "name": "arm64-windows-msvc", "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+        }
+    },
+
+    {
+        "name": "arm64-windows-llvm", "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+        }
+    },
+
+    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
+
+    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] }
+  ]
+}
diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
new file mode 100644
index 000000000..46fba6514
--- /dev/null
+++ b/cmake/arm64-windows-llvm.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/cmake/arm64-windows-msvc.cmake b/cmake/arm64-windows-msvc.cmake
new file mode 100644
index 000000000..c77631420
--- /dev/null
+++ b/cmake/arm64-windows-msvc.cmake
@@ -0,0 +1,6 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
diff --git a/common/log.h b/common/log.h
index 6934c57b2..09fa63c26 100644
--- a/common/log.h
+++ b/common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
         #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
     #else
         #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
     #endif
 #else
     #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
         #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
     #else
         #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
     #endif
 #else
     #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 //  behaves like printf, and supports arguments the exact same way.
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
     #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 //  by defining LOG_TEE_TARGET
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
     #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif
 
 // LOG macro variants with auto endline.
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
     #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
     #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
diff --git a/ggml-quants.c b/ggml-quants.c
index 9e62a3f32..f13599f6b 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3487,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = vx + bx;
-
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
@@ -3524,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3894,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
         const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = vy + by;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);
@@ -3907,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const block_q8_1 * restrict b_y0 = &vy0[i];
             const block_q8_1 * restrict b_y1 = &vy1[i];
 
-            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                   GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
-                                   GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
-                                   GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
-            summs0 += summs_t;
+            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
 
             const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -3931,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
             // mmla into int32x4_t
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
-                                 GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
-                                 GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
-                                 GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
+                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
+                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3953,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
         float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = sumv2 + summs0;
+        sumv2 = vaddq_f32(sumv2, summs0);
 
         vst1_f32(s, vget_low_f32(sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
@@ -4837,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
 
         float32x4_t sumv0 = vdupq_n_f32(0.0f);
 
@@ -4861,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
 
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                             GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32x4_t scale = vld1q_f32(_scale);
 
             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));

From 172b78210aae0e54d3668c5de14200efab9fac23 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <quic_maxk@quicinc.com>
Date: Wed, 15 May 2024 22:36:43 -0700
Subject: [PATCH 03/20] ci: fix bin/Release path for windows-arm64 builds
 (#7317)

Switch to Ninja Multi-Config CMake generator to resurect bin/Release path
that broke artifact packaging in CI.
---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2d2fea4a2..0742443c6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -712,9 +712,9 @@ jobs:
           - build: 'vulkan-x64'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'llvm-arm64'
-            defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
-            defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
       - name: Clone

From ad52d5c259344888b06fd5acd3344c663dd0621d Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav <vaibhavs10@gmail.com>
Date: Thu, 16 May 2024 07:38:43 +0200
Subject: [PATCH 04/20] doc: add references to hugging face GGUF-my-repo
 quantisation web tool. (#7288)

* chore: add references to the quantisation space.

* fix grammer lol.

* Update README.md

Co-authored-by: Julien Chaumond <julien@huggingface.co>

* Update README.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Julien Chaumond <julien@huggingface.co>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 README.md                   | 3 +++
 examples/quantize/README.md | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ecbe802df..5d6217d13 100644
--- a/README.md
+++ b/README.md
@@ -712,6 +712,9 @@ Building the program with BLAS support may lead to some performance improvements
 
 ### Prepare and Quantize
 
+> [!NOTE]
+> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
 
 Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index 8a10365c0..b78ece4e7 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -1,6 +1,8 @@
 # quantize
 
-TODO
+You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
+
+Note: It is synced from llama.cpp `main` every 6 hours.
 
 ## Llama 2 7B
 

From 0350f5815218c483fb3026a86adc44a115481625 Mon Sep 17 00:00:00 2001
From: Herman Semenov <GermanAizek@yandex.ru>
Date: Thu, 16 May 2024 06:14:24 +0000
Subject: [PATCH 05/20] grammar, json, llama: replace push on emplace if it
 possible (#7273)

---
 common/grammar-parser.cpp         |  2 +-
 common/json-schema-to-grammar.cpp | 12 ++++++------
 llama.cpp                         |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index fecb7cd71..b5bc7d49b 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -26,7 +26,7 @@ namespace grammar_parser {
 
     static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
         return result.first->second;
     }
 
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 0f8f1b1d4..9a71f5d8d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -272,7 +272,7 @@ private:
                     if (literal.empty()) {
                         return false;
                     }
-                    ret.push_back(std::make_pair(literal, true));
+                    ret.emplace_back(literal, true);
                     literal.clear();
                     return true;
                 };
@@ -298,7 +298,7 @@ private:
             while (i < length) {
                 char c = sub_pattern[i];
                 if (c == '.') {
-                    seq.push_back(std::make_pair(get_dot(), false));
+                    seq.emplace_back(get_dot(), false);
                     i++;
                 } else if (c == '(') {
                     i++;
@@ -307,7 +307,7 @@ private:
                             _warnings.push_back("Unsupported pattern syntax");
                         }
                     }
-                    seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
+                    seq.emplace_back("(" + to_rule(transform()) + ")", false);
                 } else if (c == ')') {
                     i++;
                     if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +331,9 @@ private:
                     }
                     square_brackets += ']';
                     i++;
-                    seq.push_back(std::make_pair(square_brackets, false));
+                    seq.emplace_back(square_brackets, false);
                 } else if (c == '|') {
-                    seq.push_back(std::make_pair("|", false));
+                    seq.emplace_back("|", false);
                     i++;
                 } else if (c == '*' || c == '+' || c == '?') {
                     seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -417,7 +417,7 @@ private:
                         }
                     }
                     if (!literal.empty()) {
-                        seq.push_back(std::make_pair(literal, true));
+                        seq.emplace_back(literal, true);
                     }
                 }
             }
diff --git a/llama.cpp b/llama.cpp
index 7d26966e4..0ef756a52 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17015,13 +17015,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
             }
             else {
                 if (cell_range_begin != kv_self.size) {
-                    cell_ranges.push_back({ cell_range_begin, i });
+                    cell_ranges.emplace_back(cell_range_begin, i);
                     cell_range_begin = kv_self.size;
                 }
             }
         }
         if (cell_range_begin != kv_self.size) {
-            cell_ranges.push_back({ cell_range_begin, kv_self.size });
+            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
         }
 
         // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count

From dda64fc17c97820ea9489eb0cc9ae8b8fdce4926 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 16 May 2024 02:15:23 -0400
Subject: [PATCH 06/20] convert : get general.name from model dir, not its
 parent (#5615)

Co-authored-by: Brian <mofosyne@gmail.com>
---
 convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index e2e642351..da1247957 100755
--- a/convert.py
+++ b/convert.py
@@ -1109,7 +1109,7 @@ class OutputFile:
         if metadata is not None and metadata.name is not None:
             name = metadata.name
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = params.path_model.name
         elif params.n_ctx == 4096:
             # Heuristic detection of LLaMA v2 model
             name = "LLaMA v2"

From 3b3963c55c8332e33533c44b2aa882b0e45f8292 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Wed, 15 May 2024 15:29:07 +0300
Subject: [PATCH 07/20] rpc : add command line arg for specifying backend
 memory

ref: #7293
---
 examples/rpc/README.md      |  4 +--
 examples/rpc/rpc-server.cpp | 68 +++++++++++++++++++++++++++++++------
 ggml-rpc.cpp                |  2 +-
 3 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/examples/rpc/README.md b/examples/rpc/README.md
index 325d0abc4..eeec71a8e 100644
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -42,7 +42,7 @@ cmake --build . --config Release
 Then, start the `rpc-server` with the backend:
 
 ```bash
-$ bin/rpc-server 0.0.0.0 50052
+$ bin/rpc-server -p 50052
 create_backend: using CUDA backend
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
 ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
@@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052
 
 When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
 ```bash
-$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 ```
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
 
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 496af8496..021185b83 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -10,6 +10,52 @@
 #include <string>
 #include <stdio.h>
 
+struct rpc_server_params {
+    std::string host        = "0.0.0.0";
+    int         port        = 50052;
+    size_t      backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}
+
 static ggml_backend_t create_backend() {
     ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
@@ -45,14 +91,9 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 }
 
 int main(int argc, char * argv[]) {
-    if (argc < 3) {
-        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
-        return 1;
-    }
-    const char * host = argv[1];
-    int port = std::stoi(argv[2]);
-    if (port <= 0 || port > 65535) {
-        fprintf(stderr, "Invalid port number: %d\n", port);
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
     ggml_backend_t backend = create_backend();
@@ -60,10 +101,15 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Failed to create backend\n");
         return 1;
     }
-    printf("Starting RPC server on %s:%d\n", host, port);
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
     size_t free_mem, total_mem;
-    get_backend_memory(&free_mem, &total_mem);
-    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    if (params.backend_mem > 0) {
+        free_mem = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
     start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp
index efeacb297..ba392009f 100644
--- a/ggml-rpc.cpp
+++ b/ggml-rpc.cpp
@@ -28,7 +28,7 @@
 
 #define UNUSED GGML_UNUSED
 
-#define GGML_DEBUG 1
+#define GGML_DEBUG 0
 #if (GGML_DEBUG >= 1)
 #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else

From 9afdffe70ebf3166d429b4434783bb0b7f97bdeb Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Wed, 15 May 2024 16:04:40 +0300
Subject: [PATCH 08/20] rpc : get available mem for the CPU backend

This can be overridden with the -m command line option

ref: #7293
---
 examples/rpc/rpc-server.cpp | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 021185b83..41f377376 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -7,6 +7,11 @@
 #endif
 
 #include "ggml-rpc.h"
+#ifdef _WIN32
+#  include <windows.h>
+#else
+#  include <unistd.h>
+#endif
 #include <string>
 #include <stdio.h>
 
@@ -84,9 +89,18 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #else
-    // TODO: implement for other backends
-    *free_mem = 1;
-    *total_mem = 1;
+    #ifdef _WIN32
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        *total_mem = status.ullTotalPhys;
+        *free_mem = status.ullAvailPhys;
+    #else
+        long pages = sysconf(_SC_PHYS_PAGES);
+        long page_size = sysconf(_SC_PAGE_SIZE);
+        *total_mem = pages * page_size;
+        *free_mem = *total_mem;
+    #endif
 #endif
 }
 

From 24ecb58168dce81646c2ed425690a106591c8c6d Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Thu, 16 May 2024 20:43:45 +0200
Subject: [PATCH 09/20] Revert "server bench: fix bench not waiting for model
 load (#7284)" (#7334)

This reverts commit 583fd6b000ec9ad1b465b5c98524f4a0ae388077.
---
 examples/server/bench/bench.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 25ac29c4c..86c5de101 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -293,14 +293,13 @@ def start_server_background(args):
 
 
 def is_server_listening(server_fqdn, server_port):
-    try:
-        url = f"{server_fqdn}:{server_port}/health"
-        if not url.startswith("http://"):
-            url = f"http://{url}"
-        result = requests.get(url)
-        return result.status_code == 200
-    except Exception:
-        return False
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        result = sock.connect_ex((server_fqdn, server_port))
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...")
+        return _is_server_listening
+
 
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())

From 9c4fdcbec8c7fcc428e723b0d8a1cf1f351ba642 Mon Sep 17 00:00:00 2001
From: Leon Knauer <git@leonknauer.com>
Date: Fri, 17 May 2024 02:11:03 +0200
Subject: [PATCH 10/20] [Server] Added --verbose option to README [no ci]
 (#7335)

---
 examples/server/README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index f6eb6942c..13cbdcd2c 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -17,7 +17,8 @@ The project is under active development, and we are [looking for feedback and co
 
 **Command line options:**
 
-- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
+- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
+- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
@@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
-- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
-page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
-
+- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
 - `--numa`: Attempt optimizations that may help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

From 934266c0e0b2aa9781fdba2deb112c161ff038a9 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@mozilla.com>
Date: Fri, 17 May 2024 02:58:52 -0400
Subject: [PATCH 11/20] ggml : rewrite silu and softmax for cpu (#7154)

This change upstreams llamafile's vectorized expf() functions. This lets
us compute softmax and silu more accurately than the short[65536] lookup
table that GGML previously used to make this operation go faster. We can
support aarch64 and sse2+ with the worst case rounding error of 2ulp. It
makes make -j8 tests && ./tests/test-backend-ops -o SOFT_MAX -b CPU perf
go 1.5x faster for SSE2+FMA, 1.9x faster for AVX2+FMA and 2.1x on AVX512
---
 ggml.c | 476 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 283 insertions(+), 193 deletions(-)

diff --git a/ggml.c b/ggml.c
index 684d84235..55152bce4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -165,9 +165,6 @@ void ggml_print_backtrace(void) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
@@ -318,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 // precomputed quick gelu table for f16 (128 KB)
 static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 
-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
@@ -2085,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 
-//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_silu_f16[i16[i]];
-//    }
-//}
+#if defined(__ARM_NEON)
 
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
-    }
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
 }
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+  const __m512 r = _mm512_set1_ps(0x1.8p23f);
+  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+  const __m512 n = _mm512_sub_ps(z, r);
+  const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                                    _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+  const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
+  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
+  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
+                                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
+                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
+  if (_mm512_kortestz(c, c))
+    return _mm512_fmadd_ps(j, k, k);
+  const __m512i g = _mm512_and_si512(
+      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
+      _mm512_set1_epi32(0x82000000u));
+  const __m512 s1 =
+      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
+  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+  const __mmask16 d =
+      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+  return _mm512_mask_blend_ps(
+      d, _mm512_mask_blend_ps(
+          c, _mm512_fmadd_ps(k, j, k),
+          _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
+      _mm512_mul_ps(s1, s1));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+  const __m256 r = _mm256_set1_ps(0x1.8p23f);
+  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+  const __m256 n = _mm256_sub_ps(z, r);
+  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+  const __m256 k = _mm256_castsi256_ps(
+      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+  const __m256i c = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(126), _CMP_GT_OQ));
+  const __m256 u = _mm256_mul_ps(b, b);
+  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+    return _mm256_fmadd_ps(j, k, k);
+  const __m256i g = _mm256_and_si256(
+      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+      _mm256_set1_epi32(0x82000000u));
+  const __m256 s1 =
+      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+  const __m256i d = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(192), _CMP_GT_OQ));
+  return _mm256_or_ps(
+      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+      _mm256_andnot_ps(
+          _mm256_castsi256_ps(d),
+          _mm256_or_ps(
+              _mm256_and_ps(_mm256_castsi256_ps(c),
+                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
 #else
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]);
     }
 }
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
 #endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
 
 inline static float ggml_silu_backward_f32(float x, float dy) {
     const float s = 1.0f/(1.0f + expf(-x));
     return dy*s*(1.0f + x*(1.0f - s));
 }
 
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }
-#endif
 
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
@@ -2922,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
                 float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
                 ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-                ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
             }
 
             const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -13600,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, wp);
 
-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
         assert(sum > 0.0);
 
         sum = 1.0/sum;
@@ -15374,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -15586,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -16234,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                         vvexpf(SM, SM, &Mup);
                         ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                        uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                        ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                        for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                            if (i >= masked_begin) {
-                                break;
-                            }
-                            float * SR =  S + i;
-                            float * SW = SM + i;
-
-                            for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                                if (i + j >= masked_begin) {
-                                    break;
-                                } else if (SR[j] == -INFINITY) {
-                                    SW[j] = 0.0f;
-                                } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                    const float val = expf(SR[j] - max);
-#else
-                                    ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                                    memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                    sump[j] += (ggml_float)val;
-                                    SW[j] = val;
-                                }
-                            }
-                        }
-
-                        for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                            sum += sump[i];
-                        }
+                        sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
                     }
 
@@ -17291,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             assert(!isnan(s1[i]));
         }
 #endif
+
         // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    st[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    st[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            // sum = 1.0/sum;
-        }
-        // avoid log(0) by rescaling from [0..1] to [eps..1]
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
+        assert(sum > 0.0);
         sum = (1.0 - eps) / sum;
+
+        // avoid log(0) by rescaling from [0..1] to [eps..1]
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);
@@ -17409,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif
 
         // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            sum = (1.0 - eps)/sum;
-        }
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
         // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
         ggml_vec_scale_f32(nc, ds0, sum);

From ee94172d33399d2e814ca05c8a3ff8c523ebb093 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Fri, 17 May 2024 10:00:17 +0300
Subject: [PATCH 12/20] server : add support for the RPC backend (#7305)

ref: #7292
---
 examples/server/server.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7e0d068f8..b598076d3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
     printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
     printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
+    printf("  --rpc SERVERS             comma separated list of RPC servers\n");
     printf("  --path PUBLIC_PATH        path from which to serve static files (default: disabled)\n");
     printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
     printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             sparams.port = std::stoi(argv[i]);
+        } else if (arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers = argv[i];
         } else if (arg == "--host") {
             if (++i >= argc) {
                 invalid_param = true;

From e18bc6aaf3b547890609ed254ee5248e720e5840 Mon Sep 17 00:00:00 2001
From: amd-lalithnc <lalithnc@amd.com>
Date: Fri, 17 May 2024 12:31:58 +0530
Subject: [PATCH 13/20] convert : fix Qwen/Qwen-7b conversion (#7308)

---
 convert-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index cd875fa4a..2810e1e41 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -526,7 +526,7 @@ class Model:
 
         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
         added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
 
         for i in range(vocab_size):
             if i not in reverse_vocab:

From 359cbe3f46c90ce6f5151005e411b8fb74f8139e Mon Sep 17 00:00:00 2001
From: Herman Semenov <GermanAizek@yandex.ru>
Date: Fri, 17 May 2024 07:08:49 +0000
Subject: [PATCH 14/20] ggml-quants, llama : removed excess checks (#7274)

---
 common/common.cpp | 2 +-
 ggml-quants.c     | 2 +-
 llama.cpp         | 8 ++------
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 96130ad54..e624fc7f3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
     size_t pos_start = 0;
     size_t pos_found = 0;
 
-    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
+    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
         data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
diff --git a/ggml-quants.c b/ggml-quants.c
index f13599f6b..9b291d522 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1986,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
 
         for (int j = 0; j < QK_K/16; ++j) {
             if (quant_weights) {
-                const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
+                const float * qw = quant_weights + QK_K * i + 16*j;
                 for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
             } else {
                 for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
diff --git a/llama.cpp b/llama.cpp
index 0ef756a52..daaa138b9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13904,9 +13904,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -13920,9 +13918,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }
 

From 29c60d8cddcfd14fa8a6bf023a6c4eb8692c76ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 17 May 2024 09:59:57 +0200
Subject: [PATCH 15/20] tokenization: add warning for double BOS (#7332)

---
 llama.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index daaa138b9..c5a1fa0f6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12818,6 +12818,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
@@ -12844,6 +12851,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_add_eos != -1);
                     output.push_back(vocab.special_eos_id);

From 27b040691cbe45314147c2745e891a38e9c048d4 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 17 May 2024 13:24:38 +0200
Subject: [PATCH 16/20] llama : use n_embd_head_v when reshaping kqv (#7327)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama : use n_embd_head_v instead of n_embd_head_k when reshaping kqv

* llama : use n_embd_v_gqa and n_embd_head_v instead of n_embd_k_gqa and n_embd_head_k when making a view of cached value vectors.

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
---
 llama.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index c5a1fa0f6..e11f0ac4b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6622,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6644,8 +6645,8 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);
 
@@ -6655,7 +6656,7 @@ static struct ggml_tensor * llm_build_kqv(
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
-        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
@@ -6700,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);
 
-        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }
 

From d273c1402b25086fd91aef2467ac13f2e49fa0ea Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Fri, 17 May 2024 15:11:45 +0300
Subject: [PATCH 17/20] py : convert-hf-to-gguf-update improvements (#7340)

* convert-hf-to-gguf-update: automate updating

* convert-hf-to-gguf-update: improve download

* share requests session for performance
* create directories only when needed, don't skip downloads when empty directory encountered
* be more graceful about errors
---
 convert-hf-to-gguf-update.py | 87 ++++++++++++++++--------------------
 convert-hf-to-gguf.py        |  2 +
 2 files changed, 41 insertions(+), 48 deletions(-)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 14aa0c45a..27983fadf 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #
 
 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
@@ -35,6 +37,7 @@ from transformers import AutoTokenizer
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
+sess = requests.Session()
 
 
 class TOKENIZER_TYPE(IntEnum):
@@ -79,63 +82,44 @@ models = [
     {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
-# make directory "models/tokenizers" if it doesn't exist
-if not os.path.exists("models/tokenizers"):
-    os.makedirs("models/tokenizers")
-
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        with open(save_path, 'wb') as f:
-            f.write(response.content)
-        logger.info(f"File {save_path} downloaded successfully")
-    else:
-        logger.info(f"Failed to download file. Status code: {response.status_code}")
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    logger.info(f"File {save_path} downloaded successfully")
 
 
-# download the tokenizer models
-for model in models:
+def download_model(model):
     name = model["name"]
     repo = model["repo"]
     tokt = model["tokt"]
 
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        os.makedirs(f"models/tokenizers/{name}")
-    else:
-        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
-        continue
-
-    logger.info(f"Downloading {name} to models/tokenizers/{name}")
-
-    url = f"{repo}/raw/main/config.json"
-    save_path = f"models/tokenizers/{name}/config.json"
-    download_file_with_auth(url, token, save_path)
-
-    url = f"{repo}/raw/main/tokenizer.json"
-    save_path = f"models/tokenizers/{name}/tokenizer.json"
-    download_file_with_auth(url, token, save_path)
-
-    # if downloaded file is less than 1KB, we likely need to download an LFS instead
-    if os.path.getsize(save_path) < 1024:
-        # remove the file
-        os.remove(save_path)
-        url = f"{repo}/resolve/main/tokenizer.json"
-        save_path = f"models/tokenizers/{name}/tokenizer.json"
-        download_file_with_auth(url, token, save_path)
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
     if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
-        download_file_with_auth(url, token, save_path)
+        files.append("tokenizer.model")
+
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+
+
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
-    url = f"{repo}/raw/main/tokenizer_config.json"
-    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
-    download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function
 
 src_ifs = ""
 for model in models:
@@ -224,11 +208,18 @@ src_func = f"""
         return res
 """
 
-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
 
-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")
 
 # generate tests for each tokenizer model
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2810e1e41..5ba3161c7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -402,6 +402,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     #       do not modify it manually!
     # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ class Model:
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
+        # Marker: End get_vocab_base_pre
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()

From 51e9d02599336e62948d29f1d6c05addeb921ac2 Mon Sep 17 00:00:00 2001
From: Brian <mofosyne@gmail.com>
Date: Fri, 17 May 2024 22:40:14 +1000
Subject: [PATCH 18/20] Added a single test function script and fix
 debug-test.sh to be more robust (#7279)

* run-single-test.sh: added a single test function script and fix debug-test.sh to be more robust

* debug-test.sh: combined execute and gdb test mode via -g flag

* debug-test.sh: refactor

* debug-test: refactor for clarity

* debug-test.sh: comment style changes

* debug-test.sh: fix gdb
---
 docs/debugging-tests.md |  24 +++-
 scripts/debug-test.sh   | 266 ++++++++++++++++++++++++++--------------
 2 files changed, 196 insertions(+), 94 deletions(-)

diff --git a/docs/debugging-tests.md b/docs/debugging-tests.md
index 51a125e19..18407f688 100644
--- a/docs/debugging-tests.md
+++ b/docs/debugging-tests.md
@@ -1,6 +1,6 @@
 # Debugging Tests Tips
 
-## How to run & debug a specific test without anything else to keep the feedback loop short?
+## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
 
 There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
 
@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from
 
 It will then build & run in the debugger for you.
 
+To just execute a test and get back a PASS or FAIL message run:
+
 ```bash
 ./scripts/debug-test.sh test-tokenizer
+```
+
+To test in GDB use the `-g` flag to enable gdb test mode.
+
+```bash
+./scripts/debug-test.sh -g test-tokenizer
 
 # Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
 >>> b main
 ```
 
+To speed up the testing loop, if you know your test number you can just run it similar to below:
+
+```bash
+./scripts/debug-test.sh test 23
+```
+
 For further reference use `debug-test.sh -h` to print help.
 
 &nbsp;
@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
 make -j
 ```
 
-#### Step 3.1: Identify Test Command for Debugging
+#### Step 3: Find all tests available that matches REGEX
 
 The output of this command will give you the command & arguments needed to run GDB.
 
@@ -69,11 +83,13 @@ Labels: main
 ...
 ```
 
-So for test #1 we can tell these two pieces of relevant information:
+#### Step 4: Identify Test Command for Debugging
+
+So for test #1 above we can tell these two pieces of relevant information:
 * Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
 * Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
 
-#### Step 3.2: Run GDB on test command
+#### Step 5: Run GDB on test command
 
 Based on the ctest 'test command' report above we can then run a gdb session via this command below:
 
diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh
index 231a23d69..7b2b601a9 100755
--- a/scripts/debug-test.sh
+++ b/scripts/debug-test.sh
@@ -1,117 +1,203 @@
 #!/bin/bash
-test_suite=${1:-}
-test_number=${2:-}
 
 PROG=${0##*/}
 build_dir="build-ci-debug"
 
-if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
-    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
-    echo "Debug specific ctest program."
-    echo
-    echo "Options:"
-    echo "  -h, --help       Display this help and exit"
-    echo
-    echo "Arguments:"
-    echo "  <test_regex>     (Mandatory) Supply one regex to the script to filter tests"
-    echo "  (test_number)    (Optional) Test number to run a specific test"
-    echo
-    echo "Example:"
-    echo "  $PROG test-tokenizer"
-    echo "  $PROG test-tokenizer 3"
-    echo
-    exit 0
-fi
+# Print Color Commands
+red=$(tput setaf 1)
+green=$(tput setaf 2)
+yellow=$(tput setaf 3)
+blue=$(tput setaf 4)
+magenta=$(tput setaf 5)
+cyan=$(tput setaf 6)
+normal=$(tput sgr0)
 
-# Function to select and debug a test
-function select_test() {
-    test_suite=${1:-test}
-    test_number=${2:-}
 
-    # Sanity Check If Tests Is Detected
-    printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
-    tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
-    if [ ${#tests[@]} -eq 0 ]
-    then
-        echo "No tests avaliable... check your compliation process..."
-        echo "Exiting."
-        exit 1
-    fi
+# Print Help Message
+####################
 
-    if [ -z $test_number ]
-    then
-        # List out avaliable tests
-        printf "Which test would you like to debug?\n"
-        id=0
-        for s in "${tests[@]}"
-        do
-            echo "Test# ${id}"
-            echo "  $s"
-            ((id++))
-        done
+print_full_help() {
+  cat << EOF
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
 
-        # Prompt user which test they wanted to run
-        printf "\nRun test#? "
-        read test_number
-    else
-        printf "\nUser Already Requested #${test_number}"
-    fi
+Options:
+  -h, --help            display this help and exit
+  -g                    run in gdb mode
 
-    # Start GDB with the requested test binary and arguments
-    printf "Debugging(GDB) test: ${tests[test_number]}\n"
-    # Change IFS (Internal Field Separator)
-    sIFS=$IFS
-    IFS=$'\n'
+Arguments:
+  <test_regex>     (Mandatory) Supply one regex to the script to filter tests
+  (test_number)    (Optional) Test number to run a specific test
 
-    # Get test args
-    gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
-    IFS=$sIFS
-    printf "Debug arguments: ${gdb_args[test_number]}\n\n"
-
-    # Expand paths if needed
-    args=()
-    for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
-    do
-        args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
-    done
-
-    # Execute debugger
-    echo "gdb args: ${args[@]}"
-    gdb --args ${args[@]}
+Example:
+  $PROG test-tokenizer
+  $PROG test-tokenizer 3
+EOF
 }
 
+abort() {
+  echo "Error: $1" >&2
+  cat << EOF >&2
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
+Refer to --help for full instructions.
+EOF
+  exit 1
+}
+
+
+# Dependency Sanity Check
+#########################
+
+check_dependency() {
+  command -v "$1" >/dev/null 2>&1 || {
+    abort "$1 is required but not found. Please install it and try again."
+  }
+}
+
+check_dependency ctest
+check_dependency cmake
+
+
 # Step 0: Check the args
-if [ -z "$test_suite" ]
-then
-    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
-    echo "Supply one regex to the script to filter tests,"
-    echo "and optionally a test number to run a specific test."
-    echo "Use --help flag for full instructions"
-    exit 1
+########################
+
+if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
+  print_full_help >&2
+  exit 0
 fi
 
+# Parse command-line options
+gdb_mode=false
+while getopts "g" opt; do
+    case $opt in
+        g)
+            gdb_mode=true
+            echo "gdb_mode Mode Enabled"
+            ;;
+    esac
+done
+
+# Shift the option parameters
+shift $((OPTIND - 1))
+
+# Positionial Argument Processing : <test_regex>
+if [ -z "${1}" ]; then
+    abort "Test regex is required"
+else
+    test_suite=${1:-}
+fi
+
+# Positionial Argument Processing : (test_number)
+test_number=${2:-}
+
+
 # Step 1: Reset and Setup folder context
+########################################
+
 ## Sanity check that we are actually in a git repo
 repo_root=$(git rev-parse --show-toplevel)
 if [ ! -d "$repo_root" ]; then
-    echo "Error: Not in a Git repository."
-    exit 1
+    abort "Not in a Git repository."
 fi
 
-## Reset folder to root context of git repo
-pushd "$repo_root" || exit 1
+## Reset folder to root context of git repo and Create and enter build directory
+pushd "$repo_root"
+rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
 
-## Create and enter build directory
-rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
 
 # Step 2: Setup Build Environment and Compile Test Binaries
-cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
-pushd "$build_dir" && make -j || exit 1
+###########################################################
 
-# Step 3: Debug the Test
-select_test "$test_suite" "$test_number"
+# Note: test-eval-callback requires -DLLAMA_CURL
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
+pushd "$build_dir"
+make -j || abort "Failed to compile"
+popd > /dev/null || exit 1
 
-# Step 4: Return to the directory from which the user ran the command.
-popd || exit 1
-popd || exit 1
-popd || exit 1
+
+# Step 3: Find all tests available that matches REGEX
+####################################################
+
+# Ctest Gather Tests
+# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
+# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
+# `-V` : Verbose Mode
+printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
+pushd "$build_dir"
+tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
+if [ ${#tests[@]} -eq 0 ]; then
+    abort "No tests avaliable... check your compliation process..."
+fi
+popd > /dev/null || exit 1
+
+
+# Step 4: Identify Test Command for Debugging
+#############################################
+
+# Select test number
+if [ -z $test_number ]; then
+    # List out avaliable tests
+    printf "Which test would you like to debug?\n"
+    id=0
+    for s in "${tests[@]}"
+    do
+        echo "Test# ${id}"
+        echo "  $s"
+        ((id++))
+    done
+
+    # Prompt user which test they wanted to run
+    printf "\nRun test#? "
+    read test_number
+
+else
+    printf "\nUser Already Requested #${test_number}\n"
+
+fi
+
+# Grab all tests commands
+pushd "$build_dir"
+sIFS=$IFS # Save Initial IFS (Internal Field Separator)
+IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
+test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
+IFS=$sIFS # Reset IFS (Internal Field Separator)
+popd > /dev/null || exit 1
+
+# Grab specific test command
+single_test_name="${tests[test_number]}"
+single_test_command="${test_args[test_number]}"
+
+
+# Step 5: Execute or GDB Debug
+##############################
+
+printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
+printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
+
+if [ "$gdb_mode" = "true" ]; then
+    # Execute debugger
+    pushd "$repo_root" || exit 1
+    eval "gdb --args ${single_test_command}"
+    popd > /dev/null || exit 1
+
+else
+    # Execute Test
+    pushd "$repo_root" || exit 1
+    eval "${single_test_command}"
+    exit_code=$?
+    popd > /dev/null || exit 1
+
+    # Print Result
+    printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
+    printf "${yellow}Command: ${single_test_command}${normal}\n"
+    if [ $exit_code -eq 0 ]; then
+        printf "${green}TEST PASS${normal}\n"
+    else
+        printf "${red}TEST FAIL${normal}\n"
+    fi
+
+fi
+
+# Return to the directory from which the user ran the command.
+popd > /dev/null || exit 1

From f4bd8b3d260bb09491ba63c77ab7012b744362ef Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Fri, 17 May 2024 17:25:44 +0300
Subject: [PATCH 19/20] rpc : set SO_REUSEADDR for the server socket (#7320)

ref: #7293
---
 examples/rpc/rpc-server.cpp |  4 ++++
 ggml-rpc.cpp                | 13 +++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 41f377376..7c15d2aa4 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -56,6 +56,10 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
         } else if (arg == "-h" || arg == "--help") {
             print_usage(argc, argv, params);
             exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
         }
     }
     return true;
diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp
index ba392009f..4a9bfa52d 100644
--- a/ggml-rpc.cpp
+++ b/ggml-rpc.cpp
@@ -134,7 +134,13 @@ static bool set_no_delay(sockfd_t sockfd) {
     int flag = 1;
     // set TCP_NODELAY to disable Nagle's algorithm
     int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
-    return ret >= 0;
+    return ret == 0;
+}
+
+static bool set_reuse_addr(sockfd_t sockfd) {
+    int flag = 1;
+    int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
+    return ret == 0;
 }
 
 static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
@@ -181,7 +187,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
     if (sock == nullptr) {
         return nullptr;
     }
-
+    if (!set_reuse_addr(sockfd)) {
+        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
+        return nullptr;
+    }
     struct sockaddr_in serv_addr;
     serv_addr.sin_family = AF_INET;
     serv_addr.sin_addr.s_addr = inet_addr(host);

From 82ca83db3c8d45df559c03a4225b6eb34808a2db Mon Sep 17 00:00:00 2001
From: Gavin Zhao <gavinzhaojw@protonmail.com>
Date: Fri, 17 May 2024 11:03:03 -0400
Subject: [PATCH 20/20] ROCm: use native CMake HIP support (#5966)

Supercedes #4024 and #4813.

CMake's native HIP support has become the
recommended way to add HIP code into a project (see
[here](https://rocm.docs.amd.com/en/docs-6.0.0/conceptual/cmake-packages.html#using-hip-in-cmake)).
This PR makes the following changes:

1. The environment variable `HIPCXX` or CMake option
`CMAKE_HIP_COMPILER` should be used to specify the HIP
compiler. Notably this shouldn't be `hipcc`, but ROCm's clang,
which usually resides in `$ROCM_PATH/llvm/bin/clang`. Previously
this was control by `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`.
Note that since native CMake HIP support is not yet available on
Windows, on Windows we fall back to the old behavior.

2. CMake option `CMAKE_HIP_ARCHITECTURES` is used to control the
GPU architectures to build for. Previously this was controled by
`GPU_TARGETS`.

3. Updated the Nix recipe to account for these new changes.

4. The GPU targets to build against in the Nix recipe is now
consistent with the supported GPU targets in nixpkgs.

5. Added CI checks for HIP on both Linux and Windows. On Linux, we test
both the new and old behavior.

The most important part about this PR is the separation of the
HIP compiler and the C/C++ compiler. This allows users to choose
a different C/C++ compiler if desired, compared to the current
situation where when building for ROCm support, everything must be
compiled with ROCm's clang.

~~Makefile is unchanged. Please let me know if we want to be
consistent on variables' naming because Makefile still uses
`GPU_TARGETS` to control architectures to build for, but I feel
like setting `CMAKE_HIP_ARCHITECTURES` is a bit awkward when you're
calling `make`.~~ Makefile used `GPU_TARGETS` but the README says
to use `AMDGPU_TARGETS`. For consistency with CMake, all usage of
`GPU_TARGETS` in Makefile has been updated to `AMDGPU_TARGETS`.

Thanks to the suggestion of @jin-eld, to maintain backwards
compatibility (and not break too many downstream users' builds), if
`CMAKE_CXX_COMPILER` ends with `hipcc`, then we still compile using
the original behavior and emit a warning that recommends switching
to the new HIP support. Similarly, if `AMDGPU_TARGETS` is set but
`CMAKE_HIP_ARCHITECTURES` is not, then we forward `AMDGPU_TARGETS`
to `CMAKE_HIP_ARCHITECTURES` to ease the transition to the new
HIP support.

Signed-off-by: Gavin Zhao <git@gzgz.dev>
---
 .devops/nix/package.nix     | 16 +++++-----
 .github/workflows/build.yml | 58 +++++++++++++++++++++++++++++++++++++
 CMakeLists.txt              | 42 ++++++++++++++++++++++-----
 Makefile                    |  6 ++--
 README.md                   | 25 ++++++++++++----
 5 files changed, 122 insertions(+), 25 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 2c0ae4e2a..1c9633cdf 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -227,20 +227,20 @@ effectiveStdenv.mkDerivation (
         )
       ]
       ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
       ]
       ++ optionals useMetalKit [
         (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
         (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
       ];
 
+    # Environment variables needed for ROCm
+    env = optionals useRocm {
+      ROCM_PATH = "${rocmPackages.clr}";
+      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+    };
+
     # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
     # if they haven't been added yet.
     postInstall = ''
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0742443c6..0109cc004 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -392,6 +392,33 @@ jobs:
           cmake -DLLAMA_VULKAN=ON ..
           cmake --build . --config Release -j $(nproc)
 
+  ubuntu-22-cmake-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.0.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Build with legacy HIP support
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+          cmake --build build2 --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 
@@ -989,6 +1016,37 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
           name: llama-bin-win-sycl-x64.zip
 
+  windows-latest-cmake-hip:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release
+
   ios-xcode-build:
     runs-on: macos-latest
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ab6a45a6..990e34b86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -555,16 +555,37 @@ if (LLAMA_VULKAN)
 endif()
 
 if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    if ($ENV{ROCM_PATH})
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
 
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    # CMake on Windows doesn't support the HIP language yet
+    if(WIN32)
+        set(CXX_IS_HIPCC TRUE)
+    else()
+        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
     endif()
 
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
+    if(CXX_IS_HIPCC)
+        if(LINUX)
+            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+            endif()
 
+            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                    " Prefer setting the HIP compiler directly. See README for details.")
+        endif()
+    else()
+        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+        if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS})
+        endif()
+        cmake_minimum_required(VERSION 3.21)
+        enable_language(HIP)
+    endif()
     find_package(hip     REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)
@@ -598,13 +619,18 @@ if (LLAMA_HIPBLAS)
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
 
-    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    if (CXX_IS_HIPCC)
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
+    else()
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+    endif()
 
     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
     endif()
 
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
 
 if (LLAMA_SYCL)
diff --git a/Makefile b/Makefile
index 3fa56d13a..22d521856 100644
--- a/Makefile
+++ b/Makefile
@@ -560,10 +560,10 @@ endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC                   ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
@@ -575,7 +575,7 @@ ifdef LLAMA_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
+	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
diff --git a/README.md b/README.md
index 5d6217d13..7dd6fc0eb 100644
--- a/README.md
+++ b/README.md
@@ -528,13 +528,28 @@ Building the program with BLAS support may lead to some performance improvements
     ```
   - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
     ```bash
-    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
         && cmake --build build --config Release -- -j 16
     ```
     On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
     However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+    Note that if you get the following error:
+    ```
+    clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+    ```
+    Try searching for a directory under `HIP_PATH` that contains the file
+    `oclc_abi_version_400.bc`. Then, add the following to the start of the
+    command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
+    like:
+    ```bash
+    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
+    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
+        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        && cmake --build build -- -j 16
+    ```
+
   - Using `make` (example for target gfx1030, build with 16 CPU threads):
     ```bash
     make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
@@ -543,10 +558,8 @@ Building the program with BLAS support may lead to some performance improvements
   - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
     ```bash
     set PATH=%HIP_PATH%\bin;%PATH%
-    mkdir build
-    cd build
-    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
-    cmake --build .
+    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+    cmake --build build
     ```
     Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
     Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.