From 64639555ff93c8ead2b80becb49cc6b60aeac240 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Sat, 8 Jul 2023 20:01:44 +0200
Subject: [PATCH 1/6] Fixed OpenLLaMA 3b CUDA mul_mat_vec_q (#2144)

---
 ggml-cuda.cu | 42 +++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index ec41e3524..fd36f179b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -208,6 +208,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
@@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2359,9 +2360,11 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
@@ -3105,7 +3108,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3134,11 +3141,24 @@
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-        const size_t size = ggml_nbytes_split(tensor, nrows_split);
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
 
-        void * buf;
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = (char*)data + offset_split;
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
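A minimal standalone sketch of the row-padding arithmetic that PATCH 1/6 introduces, assuming n_embd = 3200 for OpenLLaMA 3B and 4096 for LLaMA 7B; these dimensions are illustrative values, not something taken from the patch itself.

/*
 * Sketch only (not part of the patch): round a row length up to a multiple
 * of MATRIX_ROW_PADDING, the same arithmetic ggml_cuda_op_mul_mat_vec now
 * performs before quantizing src1 to q8_1.
 */
#include <stdint.h>
#include <stdio.h>

#define MATRIX_ROW_PADDING 256 /* same constant the patch adds to ggml-cuda.cu */

static int64_t pad_row(int64_t ne00) {
    int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
    padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; /* round up to a multiple of 256 */
    return padded_row_size;
}

int main(void) {
    /* Assumed n_embd values: OpenLLaMA 3B uses 3200 = 12*256 + 128, which is
     * not a multiple of 256, so the unpadded kernel could read past the end
     * of the last row. */
    printf("3200 -> %lld\n", (long long) pad_row(3200)); /* 3328 */
    /* LLaMA 7B uses 4096 = 16*256, already aligned, so it was unaffected. */
    printf("4096 -> %lld\n", (long long) pad_row(4096)); /* 4096 */
    return 0;
}

Because quantize_q8_1 now writes 0.0f for every element past ndata, and the weight-matrix padding is cudaMemset to zero, the padded tail of each row contributes nothing to the mul_mat_vec_q dot product.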
From 2492a53fd0d8372ecc67f49f07b581905175eea8 Mon Sep 17 00:00:00 2001
From: rankaiyx
Date: Sun, 9 Jul 2023 15:38:42 +0800
Subject: [PATCH 2/6] readme : add more docs indexes (#2127)

* Update README.md to add more docs indexes

* Update README.md to add more docs indexes
---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7953fd3a0..318632989 100644
--- a/README.md
+++ b/README.md
@@ -783,5 +783,10 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 
 ### Docs
 
-- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [main](./examples/main/README.md)
+- [server](./examples/server/README.md)
+- [embd-input](./examples/embd-input/README.md)
+- [jeopardy](./examples/jeopardy/README.md)
+- [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

From 3bbc1a11f04a9adc0d0e08c2940ba4d2978755ab Mon Sep 17 00:00:00 2001
From: clyang
Date: Sun, 9 Jul 2023 16:12:20 +0800
Subject: [PATCH 3/6] ggml : fix building with Intel MKL asking for "cblas.h" (#2104) (#2115)

* Fix building with Intel MKL asking for "cblas.h"

* Use angle brackets to indicate the system library
---
 CMakeLists.txt | 3 +++
 ggml.c         | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2404548f..eed7b1b7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,9 @@ if (LLAMA_BLAS)
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
+        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+        endif()
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
 
diff --git a/ggml.c b/ggml.c
index 55b0aff03..c10877a76 100644
--- a/ggml.c
+++ b/ggml.c
@@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
 #include <cblas.h>
+#endif
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
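A minimal sketch, not part of the patch, of why the GGML_BLAS_USE_MKL guard matters: MKL does not install a standalone cblas.h, so a translation unit that unconditionally includes <cblas.h> fails to compile against MKL, while <mkl.h> pulls in MKL's CBLAS declarations. The tiny sgemm below is only a smoke test showing that the same CBLAS call builds against either header; the compile flag shown in the comment is an assumption for this sketch (in llama.cpp the CMake check above defines it for you).

/* Sketch only: include selection mirroring the ggml.c change above, plus a
 * 2x2 cblas_sgemm smoke test.  Define GGML_BLAS_USE_MKL (e.g. via -D) when
 * building against MKL. */
#if defined(GGML_BLAS_USE_MKL)
#include <mkl.h>
#else
#include <cblas.h>
#endif
#include <stdio.h>

int main(void) {
    const float a[4] = {1, 2, 3, 4};   /* A = [[1,2],[3,4]] (row-major) */
    const float b[4] = {5, 6, 7, 8};   /* B = [[5,6],[7,8]]             */
    float       c[4] = {0};

    /* C = 1.0 * A * B + 0.0 * C */
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0f, a, 2, b, 2, 0.0f, c, 2);

    printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]); /* 19 22 / 43 50 */
    return 0;
}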
From 18780e0a5e17348236230bbe891901b9b5718709 Mon Sep 17 00:00:00 2001
From: JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Date: Sun, 9 Jul 2023 05:20:43 -0300
Subject: [PATCH 4/6] readme : update Termux instructions (#2147)

The file path matters when running models inside Termux on Android devices:
llama.cpp performs better when the model .bin is loaded from the $HOME
directory.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 318632989..daa71c2b9 100644
--- a/README.md
+++ b/README.md
@@ -695,7 +695,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
 
-Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
 
 ### Docker

From db4047ad5cd8eae04db3b2efe0245e69a376601a Mon Sep 17 00:00:00 2001
From: Nigel Bosch
Date: Sun, 9 Jul 2023 03:56:18 -0500
Subject: [PATCH 5/6] main : escape prompt prefix/suffix (#2151)

---
 examples/common.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/common.cpp b/examples/common.cpp
index 3278a0643..93159c6df 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -418,6 +418,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 
     if (escape_prompt) {
         process_escapes(params.prompt);
+        process_escapes(params.input_prefix);
+        process_escapes(params.input_suffix);
     }
 
     return true;

From 1d1630996920f889cdc08de26cebf2415958540e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 9 Jul 2023 05:59:53 -0300
Subject: [PATCH 6/6] llama : remove "first token must be BOS" restriction (#2153)

---
 llama.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ee6ec0920..a491f1c7e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1291,12 +1291,6 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-    // enforce that the first token is BOS
-    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
-        return false;
-    }
-
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
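A minimal sketch, not part of the patch, of what removing the check allows. It assumes the llama.h C API as it existed around this series (llama_context_default_params, llama_init_from_file, llama_eval, llama_free); the model path and token ids are placeholders for illustration only.

/* Sketch only: evaluate a fresh context (n_past == 0) on tokens that do not
 * start with BOS.  Before PATCH 6/6, llama_eval_internal rejected this with
 * "first token must be BOS"; afterwards, prepending llama_token_bos() is the
 * caller's decision.  Model path and token ids below are placeholders. */
#include "llama.h"
#include <stdio.h>

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_token tokens[3] = { 100, 200, 300 }; /* no BOS at tokens[0] */
    if (llama_eval(ctx, tokens, 3, /*n_past =*/ 0, /*n_threads =*/ 4) != 0) {
        fprintf(stderr, "llama_eval failed\n");
    }

    llama_free(ctx);
    return 0;
}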