From 21f3d1be867b4d7be07c26f5da6e4bc69bcf4d27 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <iainst0409@gmail.com>
Date: Sat, 2 Sep 2023 20:23:45 +0800
Subject: [PATCH 1/4] k-quants : fix build on armv7 (android only) (#2920)

* k-quants : fix build on armv7

* ggml : cleanup unused arm32 specific impl

* k-quants : avoid some unused vzero / mzero define

* ggml-alloc : use 4g for MEASURE_MAX_SIZE in 32-bit arm
---
 ggml-alloc.c |  7 +++++++
 ggml.c       | 46 ----------------------------------------------
 k_quants.c   | 40 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/ggml-alloc.c b/ggml-alloc.c
index f07a4a217..459f121ca 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -284,7 +284,14 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
 // address and size of the buffer when measuring
 // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
 static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+#if defined(__ARM_NEON) && !defined(__aarch64__)
+// 32-bit
+// TODO: Use for 32-bit x86 as well
+static const size_t MEASURE_MAX_SIZE  = (1ULL<<32) - 1; // 4 GB
+#else
+// 64-bit
 static const size_t MEASURE_MAX_SIZE  = 1ULL<<40; // 1 TB
+#endif
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
diff --git a/ggml.c b/ggml.c
index cf3955f7f..38b1155c1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -817,46 +817,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +825,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
diff --git a/k_quants.c b/k_quants.c
index 3deeaedf7..4accd2480 100644
--- a/k_quants.c
+++ b/k_quants.c
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
@@ -1302,7 +1322,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1608,7 +1630,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2592,8 +2616,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3092,9 +3114,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3437,8 +3461,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3656,7 +3682,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t  m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4045,8 +4073,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t  vzero = vdupq_n_s32(0);
     const int8x16_t  m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 

From 8b56b4f2c396eae1f4417e5a859557fed989e0ee Mon Sep 17 00:00:00 2001
From: Karsten Weiss <knweiss@gmail.com>
Date: Sat, 2 Sep 2023 14:29:09 +0200
Subject: [PATCH 2/4] metal : show all Metal device instances in the system
 (#2952)

* ggml_metal_init: Show all Metal device instances in the system

Also show the default Metal device that was picked.

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-metal.m | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 4267db9be..88e7e1356 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -116,10 +116,24 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    id <MTLDevice> device;
+    NSString * s;
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;

From 52315a421674ff64305dbf082f69e4ec77f0a3f3 Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Sat, 2 Sep 2023 09:53:18 -0300
Subject: [PATCH 3/4] readme : update clblast instructions (#2903)

* Update Windows CLBlast instructions

* Update Windows CLBlast instructions

* Remove trailing whitespace
---
 README.md | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 4e6a0957d..5eda5f006 100644
--- a/README.md
+++ b/README.md
@@ -464,6 +464,8 @@ Building the program with BLAS support may lead to some performance improvements
   You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
     - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
 
+    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
+
     - <details>
         <summary>Installing the OpenCL SDK from source</summary>
 
@@ -481,10 +483,27 @@ Building the program with BLAS support may lead to some performance improvements
         ```
       </details>
 
-  Installing CLBlast: it may be found in your operating system's packages.
+  ##### Installing CLBlast
+
+  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+
+  Alternatively, they may be built from source.
 
   - <details>
-    <summary>If not, then installing from source:</summary>
+    <summary>Windows:</summary>
+
+      ```cmd
+      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
+      git clone https://github.com/CNugteren/CLBlast.git
+      mkdir CLBlast\build
+      cd CLBlast\build
+      cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+      cmake --build . --config Release
+      cmake --install . --prefix C:/CLBlast
+      ```
+
+  - <details>
+    <summary>Unix:</summary>
 
       ```sh
       git clone https://github.com/CNugteren/CLBlast.git
@@ -498,21 +517,32 @@ Building the program with BLAS support may lead to some performance improvements
       Where `/some/path` is where the built library will be installed (default is `/usr/local`).
     </details>
 
-  Building:
+  ##### Building Llama with CLBlast
 
   - Build with make:
     ```sh
     make LLAMA_CLBLAST=1
     ```
-  - CMake:
+  - CMake (Unix):
     ```sh
     mkdir build
     cd build
     cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
     cmake --build . --config Release
     ```
+  - CMake (Windows):
+    ```cmd
+    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
+    git clone https://github.com/ggerganov/llama.cpp
+    cd llama.cpp
+    mkdir build
+    cd build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build . --config Release
+    cmake --install . --prefix C:/LlamaCPP
+    ```
 
-  Running:
+  ##### Running Llama with CLBlast
 
   The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
 

From 3358c381f6251bf6e65855e1c93bfaa9ec82ddb3 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sat, 2 Sep 2023 11:53:55 -0600
Subject: [PATCH 4/4] logging: Fix creating empty file even when disabled
 (#2966)

* logging: Fix creating empty file even when disabled

* Minor formatting fix

Co-authored-by: staviq <staviq@gmail.com>

---------

Co-authored-by: staviq <staviq@gmail.com>
---
 common/log.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/common/log.h b/common/log.h
index bf9fafd68..0b9b01052 100644
--- a/common/log.h
+++ b/common/log.h
@@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
         }
     }
 
+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
+
     if (_initialized)
     {
-        if (_disabled)
-        {
-            // Log is disabled
-            return nullptr;
-        }
-
         // with fallback in case something went wrong
         return logfile ? logfile : stderr;
     }