Merge branch 'master' into concedo_experimental
# Conflicts:
#	CMakeLists.txt
#	README.md
#	llama.cpp
commit 50097e6c7f
4 changed files with 38 additions and 16 deletions
@@ -418,6 +418,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     if (escape_prompt) {
         process_escapes(params.prompt);
+        process_escapes(params.input_prefix);
+        process_escapes(params.input_suffix);
     }
 
     return true;
 }
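
Note: with this change the escape pass is applied to the input prefix and suffix as well as the prompt, so sequences such as "\n" typed on the command line (e.g. via --in-prefix) become literal newlines. A minimal illustrative sketch of such a pass follows; it is not the repository's actual process_escapes(), and the set of escapes handled is an assumption.

#include <cstdio>
#include <string>
#include <utility>

// Illustrative sketch only: collapse a few common backslash escapes in-place.
static void process_escapes_sketch(std::string & s) {
    std::string out;
    out.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] == '\\' && i + 1 < s.size()) {
            switch (s[++i]) {
                case 'n':  out += '\n'; break;
                case 't':  out += '\t'; break;
                case '\\': out += '\\'; break;
                default:   out += '\\'; out += s[i]; break; // keep unknown escapes untouched
            }
        } else {
            out += s[i];
        }
    }
    s = std::move(out);
}

int main() {
    std::string prefix = "\\n### Instruction:\\n"; // as received from the shell
    process_escapes_sketch(prefix);
    printf("%s", prefix.c_str()); // prints with real newlines
    return 0;
}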
ggml-cuda.cu (42)
@@ -208,6 +208,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
+    const int ib = i / QK8_1; // block index
+    const int iqs = i % QK8_1; // quant index
 
-    const float xi = x[i];
+    const float xi = i < ndata ? x[i] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
@@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
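
Note: the launch still covers all k elements of the padded row, while ndata marks how many of them are real; the kernel above quantizes the tail as zeros instead of reading past the source buffer. A small worked example of the block-count arithmetic, assuming CUDA_QUANTIZE_BLOCK_SIZE is 256 and using illustrative ndata/k values:

#include <cstdio>

int main() {
    const int CUDA_QUANTIZE_BLOCK_SIZE = 256; // assumed value of the constant
    const int ndata = 3200;                   // real elements in the row (illustrative)
    const int k     = 3328;                   // padded length, multiple of MATRIX_ROW_PADDING
    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    printf("num_blocks = %d\n", num_blocks);              // 13 blocks cover all padded elements
    printf("padding elements zeroed: %d\n", k - ndata);   // 128
    return 0;
}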
@@ -2384,9 +2385,11 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
+        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
+        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
 
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
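
Note: the two added lines round ne00 up to the next multiple of MATRIX_ROW_PADDING, so the temporary q8_1 buffer always holds whole padded rows. A minimal sketch of the same rounding, where ne00 = 3200 is just an illustrative row length that is not a multiple of 256:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 256;
    const int64_t ne00 = 3200; // illustrative
    int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
    padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
    printf("%lld -> %lld\n", (long long) ne00, (long long) padded_row_size); // 3200 -> 3328
    return 0;
}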
@@ -3133,7 +3136,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
     const size_t nb1 = tensor->nb[1];
+
     ggml_backend backend = tensor->backend;
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
@@ -3162,11 +3169,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         int64_t nrows_split = row_high - row_low;
 
         const size_t offset_split = row_low*nb1;
-        const size_t size = ggml_nbytes_split(tensor, nrows_split);
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
 
-        void * buf;
+        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+
+        char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        void * buf_host = (char*)data + offset_split;
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
 
         cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
 
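
Note: the device allocation is enlarged by the bytes needed to complete the last row to a multiple of MATRIX_ROW_PADDING elements, and that padding is memset to 0 so uninitialized device memory cannot leak NaNs into the dot products. A worked example of the extra-byte arithmetic; the Q4_0 figures (32 elements and 18 bytes per block) and ne0 = 3200 are chosen for illustration:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 256;
    const int64_t ne0        = 3200; // row length, not a multiple of 256 (illustrative)
    const size_t  type_size  = 18;   // bytes per Q4_0 block (assumed)
    const size_t  block_size = 32;   // elements per Q4_0 block (assumed)

    size_t extra = 0;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        extra = (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * type_size / block_size;
    }
    printf("extra bytes allocated: %zu\n", extra); // 128 padding elements -> 72 bytes
    return 0;
}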
ggml.c (4)
@@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include "ggml-opencl.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
 #include <cblas.h>
+#endif
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
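
Note: with these guards, defining GGML_BLAS_USE_MKL in addition to GGML_USE_OPENBLAS makes ggml.c include MKL's header instead of the reference <cblas.h>. A minimal standalone sketch of the same selection, useful for checking which header a given set of -D flags picks up; include paths and linking are left to the build system and are not shown:

// Compile with e.g. -DGGML_USE_OPENBLAS, or -DGGML_USE_OPENBLAS -DGGML_BLAS_USE_MKL.
#if defined(GGML_USE_OPENBLAS)
#  if defined(GGML_BLAS_USE_MKL)
#    include <mkl.h>
#  else
#    include <cblas.h>
#  endif
#endif

int main(void) { return 0; }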
llama.cpp (6)

@@ -1291,11 +1291,7 @@ static bool llama_eval_internal(
     int n_threads,
     const char * cgraph_fname) {
 
-    // // enforce that the first token is BOS
-    // if (n_past == 0 && tokens[0] != llama_token_bos()) {
-    //     fprintf(stderr, "%s: first token must be BOS\n", __func__);
-    //     return false;
-    // }
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
     const int64_t t_start_us = ggml_time_us();
 