From 921dcee00a55d9aba3b3026d0509d31ac8386e2a Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Fri, 5 May 2023 16:43:36 +0200
Subject: [PATCH 1/5] readme: add missing info (#1324)

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f1fa63542..233c5c5e1 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,12 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
-- AVX2 support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit integer quantization support
+- 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
+- OpenBLAS support
+- cuBLAS and CLBlast support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
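The README entries above map to build-time switches. As a quick reference, this is a sketch of how each BLAS backend is enabled (the `LLAMA_OPENBLAS` and `LLAMA_CUBLAS` flags appear verbatim in patches 2 and 3 below; `LLAMA_CLBLAST` is an assumed flag name inferred from the CLBlast support the README now mentions, and is not exercised anywhere in this series):

```bash
# OpenBLAS (flag shown in the README hunk of PATCH 3/5)
make LLAMA_OPENBLAS=1

# cuBLAS via CMake (flags shown in the CI workflow of PATCH 2/5)
mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release

# CLBlast -- assumed flag name, not confirmed by any patch in this series
make LLAMA_CLBLAST=1
```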
From a3b85b28da84c67c3406807aef5e0457bcc4b00f Mon Sep 17 00:00:00 2001
From: Erik Scholz
Date: Fri, 5 May 2023 22:56:09 +0200
Subject: [PATCH 2/5] ci : add cublas to windows release (#1271)

---
 .github/workflows/build.yml | 77 +++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 179080576..18bb33f94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -210,6 +210,82 @@ jobs:
         path: |
           llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
 
+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non dev are not enough
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 
@@ -221,6 +297,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
+      - windows-latest-cmake-cublas
 
     steps:
       - name: Download artifacts
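The job above packs the CUDA runtime DLLs (`cudart64_*`, `cublas64_*`, `cublasLt64_*`) into a separate `cudart-llama-*.zip` so the cuBLAS binaries can run on machines without a local CUDA toolkit. As a sketch, the build and packaging steps translate to roughly the following PowerShell on a machine with the CUDA 12 toolkit installed (`$env:CUDA_PATH` is assumed to point at the toolkit, as the installer sets it; the DLL names are the CUDA 12 ones used above):

```powershell
# Mirror of the workflow's "Build" step (run from the repository root)
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release

# Mirror of "Copy and pack Cuda runtime": collect the redistributable DLLs
# next to the binaries so both can be shipped together
cd ..
mkdir .\build\bin\cudart
cp "$env:CUDA_PATH\bin\cudart64_12.dll"   .\build\bin\cudart\
cp "$env:CUDA_PATH\bin\cublas64_12.dll"   .\build\bin\cudart\
cp "$env:CUDA_PATH\bin\cublasLt64_12.dll" .\build\bin\cudart\
```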
From 173d0e6419e8f8f3c1f4f13201b777f4c60629f3 Mon Sep 17 00:00:00 2001
From: DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Date: Fri, 5 May 2023 23:57:14 +0200
Subject: [PATCH 3/5] makefile: automatic Arch Linux detection (#1332)

This commit ports a detection method used in koboldcpp's Makefile so that
the -lcblas linker option is set automatically on Arch Linux.
---
 Makefile  | 6 +++++-
 README.md | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 260b2487f..0ddff9961 100644
--- a/Makefile
+++ b/Makefile
@@ -107,7 +107,11 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	LDFLAGS += -lopenblas
+	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
+		LDFLAGS += -lopenblas -lcblas
+	else
+		LDFLAGS += -lopenblas
+	endif
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
diff --git a/README.md b/README.md
index 233c5c5e1..19cc94aa2 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,6 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   make LLAMA_OPENBLAS=1
   ```
-  Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`
 
 - On Windows:
 
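The Makefile condition takes the `-lcblas` branch whenever the `grep` produces any output. This is a small sketch of what `ifneq ($(shell ...),)` evaluates, runnable in any shell — on Arch and Arch-derived distros, `/etc/os-release` contains either the string "Arch Linux" or `ID_LIKE=arch`, and the extra `-lcblas` is needed there because, as we read the commit, Arch packages the CBLAS interface separately from OpenBLAS:

```bash
# Non-empty grep output is exactly the condition under which the Makefile
# appends -lcblas; no match (or a missing file) takes the else branch.
if grep -q -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null; then
    echo "Arch-like system: linking with -lopenblas -lcblas"
else
    echo "other system:     linking with -lopenblas"
fi
```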
From 3924088512d9e12e90ed6dbf28a6c5712481d33e Mon Sep 17 00:00:00 2001
From: Jed Fox
Date: Sat, 6 May 2023 17:01:47 -0400
Subject: [PATCH 4/5] Remove default arguments from sampling functions (#1343)

---
 .gitignore              | 1 +
 examples/main/main.cpp  | 8 ++++----
 llama.cpp               | 2 +-
 llama.h                 | 8 ++++----
 tests/test-sampling.cpp | 8 ++++----
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index e479c6180..6f275fea4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ build-sanitize-addr/
 build-sanitize-thread/
 
 models/*
+*.bin
 
 /main
 /quantize
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 43dca8eb5..5ac151e14 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -444,10 +444,10 @@ int main(int argc, char ** argv) {
                 id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
             } else {
                 // Temperature sampling
-                llama_sample_top_k(ctx, &candidates_p, top_k);
-                llama_sample_tail_free(ctx, &candidates_p, tfs_z);
-                llama_sample_typical(ctx, &candidates_p, typical_p);
-                llama_sample_top_p(ctx, &candidates_p, top_p);
+                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
                 llama_sample_temperature(ctx, &candidates_p, temp);
                 id = llama_sample_token(ctx, &candidates_p);
             }
diff --git a/llama.cpp b/llama.cpp
index 85af4dc49..c36c6ced6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1791,7 +1791,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
diff --git a/llama.h b/llama.h
index e993c464a..58c6e0699 100644
--- a/llama.h
+++ b/llama.h
@@ -202,16 +202,16 @@ extern "C" {
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 8ce59af3d..9174c1e37 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -32,7 +32,7 @@ void test_top_k(const std::vector<float> & probs,
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);
-    llama_sample_top_k(nullptr, &candidates_p, k);
+    llama_sample_top_k(nullptr, &candidates_p, k, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -57,7 +57,7 @@ void test_top_p(const std::vector<float> & probs,
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);
-    llama_sample_top_p(nullptr, &candidates_p, p);
+    llama_sample_top_p(nullptr, &candidates_p, p, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -80,7 +80,7 @@ void test_tfs(const std::vector<float> & probs,
 
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     DUMP(&candidates_p);
-    llama_sample_tail_free(nullptr, &candidates_p, z);
+    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());
@@ -103,7 +103,7 @@ void test_typical(const std::vector<float> & probs,
 
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     DUMP(&candidates_p);
-    llama_sample_typical(nullptr, &candidates_p, p);
+    llama_sample_typical(nullptr, &candidates_p, p, 1);
     DUMP(&candidates_p);
 
     assert(candidates_p.size == expected_probs.size());

From 1b0fd454650ef4d68a980e3225488b79e6e9af25 Mon Sep 17 00:00:00 2001
From: swittk
Date: Sun, 7 May 2023 10:03:23 +0700
Subject: [PATCH 5/5] ggml : Allow usage of CLBlast alongside Accelerate.framework (#1336)

Minor edit in ggml.c: defining GGML_USE_ACCELERATE would previously prevent
OpenCL from being loaded at all. Gives a minor speedup in prompt eval time.
---
 ggml.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml.c b/ggml.c
index 4d49242a4..1b89bdd89 100644
--- a/ggml.c
+++ b/ggml.c
@@ -137,6 +137,9 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
+#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
+#include "ggml-opencl.h"
+#endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
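With the defaults removed in patch 4, every caller now spells out `min_keep`, the smallest number of candidates each filter must leave in the array. This also keeps `llama.h` usable as a plain C header, since default arguments are a C++-only feature — our reading of the motivation, not stated in the patch. A minimal sketch of the resulting call sequence, mirroring the `main.cpp` hunk above (`ctx` and `candidates_p` are assumed to be set up as in `examples/main/main.cpp`; the parameter values are illustrative, not project defaults):

```cpp
// Each filter may shrink candidates_p, but must keep at least min_keep entries.
const int   top_k     = 40;     // illustrative values
const float tfs_z     = 1.00f;
const float typical_p = 1.00f;
const float top_p     = 0.95f;
const float temp      = 0.80f;

llama_sample_top_k     (ctx, &candidates_p, top_k,     /*min_keep=*/1);
llama_sample_tail_free (ctx, &candidates_p, tfs_z,     /*min_keep=*/1);
llama_sample_typical   (ctx, &candidates_p, typical_p, /*min_keep=*/1);
llama_sample_top_p     (ctx, &candidates_p, top_p,     /*min_keep=*/1);
llama_sample_temperature(ctx, &candidates_p, temp);
llama_token id = llama_sample_token(ctx, &candidates_p);
```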