Merge branch 'master' into concedo_experimental

# Conflicts: # Makefile # ggml.c # llama.cpp
2023-05-14 11:34:41 +08:00 · 2023-05-14 11:34:41 +08:00 · e01e373e63
commit e01e373e63
parent 9cd5b9a769 08737ef720
16 changed files with 6698 additions and 350 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,7 @@ build-debug/
 build-release/
 build-static/
 build-cublas/
+build-opencl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
--- a/9
+++ b/9
@ -84,6 +84,15 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif

+ifdef LLAMA_GPROF
+	CFLAGS   += -pg
+	CXXFLAGS += -pg
+endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -36,4 +36,5 @@ else()
    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
+    add_subdirectory(baby-llama)
 endif()
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@ -0,0 +1,4 @@
+set(TARGET baby-llama)
+add_executable(${TARGET} baby-llama.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -277,6 +277,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_gpu_layers = std::stoi(argv[i]);
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--mtest") {
@ -421,6 +427,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    if (llama_mmap_supported()) {
        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, "                        number of layers to store in VRAM\n");
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
@ -465,6 +473,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {

    lparams.n_ctx        = params.n_ctx;
    lparams.n_parts      = params.n_parts;
+    lparams.n_gpu_layers = params.n_gpu_layers;
    lparams.seed         = params.seed;
    lparams.f16_kv       = params.memory_f16;
    lparams.use_mmap     = params.use_mmap;
--- a/examples/common.h
+++ b/examples/common.h
@ -28,6 +28,7 @@ struct gpt_params {
    int32_t n_ctx         = 512; // context size
    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -32,9 +32,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
        }                                                                               \
    } while (0)

+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization

 #define QK4_0 32
+#define QR4_0 2
 typedef struct {
    float   d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
@ -42,6 +48,7 @@ typedef struct {
 static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
+#define QR4_1 2
 typedef struct {
    float   d;              // delta
    float   m;              // min
@ -50,6 +57,7 @@ typedef struct {
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
+#define QR5_0 2
 typedef struct {
    half d;                 // delta
    uint8_t qh[4];          // 5-th bit of quants
@ -58,6 +66,7 @@ typedef struct {
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

 #define QK5_1 32
+#define QR5_1 2
 typedef struct {
    half d;                 // delta
    half m;                 // min
@ -67,12 +76,100 @@ typedef struct {
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

 #define QK8_0 32
+#define QR8_0 1
 typedef struct {
    float   d;              // delta
    int8_t  qs[QK8_0];      // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");

+#define CUDA_DMMV_BLOCK_SIZE 32
+
+static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
+
+    v0 = (vi0 - 8)*d;
+    v1 = (vi1 - 8)*d;
+}
+
+static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const float d = x[ib].d;
+    const float m = x[ib].m;
+
+    const uint8_t vui = x[ib].qs[iqs];
+
+    const int8_t vi0 = vui & 0xF;
+    const int8_t vi1 = vui >> 4;
+
+    v0 = vi0*d + m;
+    v1 = vi1*d + m;
+}
+
+static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const float d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1) - 16;
+
+    v0 = x0*d;
+    v1 = x1*d;
+}
+
+static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const float d = x[ib].d;
+    const float m = x[ib].m;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+    v0 = x0*d + m;
+    v1 = x1*d + m;
+}
+
+static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const float d = x[ib].d;
+
+    const int8_t vi0 = x[ib].qs[iqs + 0];
+    const int8_t vi1 = x[ib].qs[iqs + 1];
+
+    v0 = vi0*d;
+    v1 = vi1*d;
+}
+
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const half * x = (const half *) vx;
+
+    v0 = __half2float(x[ib + 0]);
+    v1 = __half2float(x[ib + 1]);
+}
+
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
    static const int qk = QK4_0;

@ -173,6 +270,44 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
    }
 }

+template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    __shared__ float tmp[block_size]; // separate sum for each thread
+    tmp[tid] = 0;
+
+    for (int i = 0; i < ncols/block_size; i += 2) {
+        const int col = i*block_size + 2*tid;
+        const int ib = (row*ncols + col)/qk; // block index
+        const int iqs = (col%qk)/qr; // quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // dequantize
+        float v0, v1;
+        dequantize_kernel(vx, ib, iqs, v0, v1);
+
+        // matrix multiplication
+        tmp[tid] += v0 * y[iybs + iqs + 0];
+        tmp[tid] += v1 * y[iybs + iqs + y_offset];
+    }
+
+    // sum up partial sums and write back result
+    __syncthreads();
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
+    }
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_0;
    dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
@ -198,6 +333,36 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }

+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 // TODO: optimize
 static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
    const half * x = (const half *) vx;
@ -211,6 +376,12 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
 }

+static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
@ -230,8 +401,27 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    }
 }

+static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_mul_mat_vec_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_mul_mat_vec_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_mul_mat_vec_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_mul_mat_vec_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_mul_mat_vec_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_mul_mat_vec_f16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+#define MAX_CUDA_BUFFERS 256

 struct scoped_spin_lock {
    std::atomic_flag& lock;
@ -528,6 +718,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;

    const float alpha = 1.0f;
    const float beta = 0.0f;
@ -538,12 +729,16 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);

    size_t x_size, y_size, d_size, q_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_X = nullptr;
+    if (!mul_mat_vec) {
+        d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    }
    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
    GGML_ASSERT(to_fp32_cuda != nullptr);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -553,13 +748,35 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
            cudaEvent_t  cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];

-            float * c_X = d_X + i * x_ne;
            float * c_Y = d_Y + i * y_ne;
            float * c_D = d_D + i * d_ne;
            char  * c_Q = d_Q + i * q_sz;

-            // copy src0 and convert to fp32 on device
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            } else if (src0->backend == GGML_BACKEND_CUDA) {
+                c_Q = ((char *) src0->data) + i * q_sz;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for data
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+
+            } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
+                float * c_X = d_X + i * x_ne;
+
+                // convert src0 to fp32 on device
                to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
                CUDA_CHECK(cudaGetLastError());
                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
@ -578,6 +795,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
                            &alpha, c_X, ne00,
                                    c_Y, ne10,
                            &beta,  c_D, ne01));
+            }

            // copy dst to host
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@ -586,7 +804,9 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    }

    CUDA_CHECK(cudaDeviceSynchronize());
+    if (!mul_mat_vec) {
        ggml_cuda_pool_free(d_X, x_size);
+    }
    ggml_cuda_pool_free(d_Y, y_size);
    ggml_cuda_pool_free(d_D, d_size);
    ggml_cuda_pool_free(d_Q, q_size);
@ -602,8 +822,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
        return true;
    }

@ -655,3 +874,25 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
        return 0;
    }
 }
+
+void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne3 = tensor->ne[3];
+
+    const ggml_type type = tensor->type;
+    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+
+    size_t q_size;
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+
+    cudaStream_t cudaStream2 = g_cudaStreams2[0];
+
+    // copy tensor to device
+    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    tensor->data = d_Q;
+    tensor->backend = GGML_BACKEND_CUDA;
+}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -14,6 +14,8 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -192,7 +192,7 @@

 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
@ -244,6 +244,11 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN     = -1,
@ -265,12 +270,16 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
        GGML_OP_SUB,
        GGML_OP_MUL,
        GGML_OP_DIV,
        GGML_OP_SQR,
        GGML_OP_SQRT,
+        GGML_OP_LOG,
        GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_REPEAT,
        GGML_OP_ABS,
@ -280,12 +289,15 @@ extern "C" {
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,

        GGML_OP_MUL_MAT,

        GGML_OP_SCALE,
+        GGML_OP_SET,
        GGML_OP_CPY,
        GGML_OP_CONT,
        GGML_OP_RESHAPE,
@ -293,9 +305,13 @@ extern "C" {
        GGML_OP_PERMUTE,
        GGML_OP_TRANSPOSE,
        GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
        GGML_OP_SOFT_MAX,
        GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
        GGML_OP_ALIBI,
        GGML_OP_CONV_1D_1S,
        GGML_OP_CONV_1D_2S,
@ -325,6 +341,7 @@ extern "C" {
    // n-dimensional tensor
    struct ggml_tensor {
        enum ggml_type type;
+        enum ggml_backend backend;

        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -355,7 +372,7 @@ extern "C" {

        char name[32];

-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[9]; // TODO: remove and add padding to name?
    };

    // computation graph
@ -499,6 +516,29 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_sub(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -522,12 +562,24 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return scalar
-    // TODO: compute sum along rows
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // mean along rows
    GGML_API struct ggml_tensor * ggml_mean(
            struct ggml_context * ctx,
@ -569,6 +621,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // normalize along rows
    // TODO: eps is hardcoded to 1e-5 for now
    GGML_API struct ggml_tensor * ggml_norm(
@ -579,6 +638,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // A: m rows, n columns
    // B: p rows, n columns (i.e. we transpose it internally)
    // result is m columns, p rows
@ -591,12 +657,66 @@ extern "C" {
    // operations on tensors without backpropagation
    //

-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
            struct ggml_context * ctx,
@ -617,6 +737,11 @@ extern "C" {

    // return view(a)
    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
    GGML_API struct ggml_tensor * ggml_reshape_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -632,6 +757,14 @@ extern "C" {
            int64_t               ne1,
            int64_t               ne2);

+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
    // offset in bytes
    GGML_API struct ggml_tensor * ggml_view_1d(
            struct ggml_context * ctx,
@ -657,6 +790,18 @@ extern "C" {
            size_t                nb2, // slice stride in bytes
            size_t                offset);

+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_permute(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -675,20 +820,50 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+        struct ggml_context     * ctx,
+        struct ggml_tensor      * a);
+
    // set elements above the diagonal to -INF
-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);

    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
    GGML_API struct ggml_tensor * ggml_soft_max(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // rotary position embedding
    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
    // if mode & 1 == 1, skip n_past elements
    // if mode & 2 == 1, GPT-NeoX style
    // TODO: avoid creating a new tensor every time
@ -699,6 +874,23 @@ extern "C" {
            int                   n_dims,
            int                   mode);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
    // alibi position embedding
    // in-place, returns view(a)
    struct ggml_tensor * ggml_alibi(
@ -743,13 +935,13 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-            const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t   fun);

    GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-            const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t   fun);

    //
    // automatic differentiation
--- a/llama.cpp
+++ b/llama.cpp
@ -9,6 +9,9 @@
 #include "llama.h"

 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif

 #include <array>
 #include <ctime>
@ -50,49 +53,49 @@ static const size_t MB = 1024*1024;

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
-    return _MEM_REQ_SCRATCH0;
+    return k_sizes;
 }

 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,  1024ull * MB },
    };
-    return _MEM_REQ_SCRATCH1;
+    return k_sizes;
 }

 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,   1026ull * MB },
        { MODEL_13B,  1608ull * MB },
        { MODEL_30B,  3124ull * MB },
        { MODEL_65B,  5120ull * MB },
    };
-    return _MEM_REQ_KV_SELF;
+    return k_sizes;
 }

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> k_sizes = {
        { MODEL_7B,   800ull * MB },
        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
    };
-    return _MEM_REQ_EVAL;
+    return k_sizes;
 }

 // default hparams (LLaMA 7B)
@ -597,12 +600,12 @@ struct llama_model_loader {
    std::unique_ptr<llama_mmap> mapping;

    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
        file_loaders.emplace_back(first_file);
        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
        for (uint32_t i = 1; i < n_parts; i++) {
            std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
            file_loaders.emplace_back(ith_file);
            if (ith_file->hparams != first_file->hparams) {
                throw format("llama.cpp: hparams inconsistent between files");
@ -649,7 +652,7 @@ struct llama_model_loader {
        }
    }

-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@ -678,7 +681,7 @@ struct llama_model_loader {
        return tensor;
    }

-    void done_getting_tensors() {
+    void done_getting_tensors() const {
        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
            throw std::string("llama.cpp: file contained more tensors than expected");
        }
@ -821,6 +824,7 @@ struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.n_ctx                       =*/ 512,
        /*.n_parts                     =*/ -1,
+        /*.gpu_layers                  =*/ 0,
        /*.seed                        =*/ -1,
        /*.f16_kv                      =*/ false,
        /*.logits_all                  =*/ false,
@ -889,6 +893,7 @@ static void llama_model_load_internal(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
+        int n_gpu_layers,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -948,7 +953,8 @@ static void llama_model_load_internal(

    auto & ctx = model.ctx;

-    size_t ctx_size, mmapped_size;
+    size_t ctx_size;
+    size_t mmapped_size;
    ml->calc_sizes(&ctx_size, &mmapped_size);
    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

@ -1035,6 +1041,35 @@ static void llama_model_load_internal(
    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

    model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
@ -1045,6 +1080,7 @@ static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
+        int n_gpu_layers,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1052,7 +1088,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                  vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::string & err) {
@ -1088,7 +1124,7 @@ static bool llama_eval_internal(
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;

-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;

    LLAMA_ASSERT(!!kv_self.ctx);

@ -1141,8 +1177,8 @@ static bool llama_eval_internal(
        // self-attention
        {
            // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
            ggml_set_name(Qcur, "Qcur");
            ggml_set_name(Kcur, "Kcur");

@ -1183,17 +1219,19 @@ static bool llama_eval_internal(
            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
            ggml_set_name(KQ_scaled, "KQ_scaled");

            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
            ggml_set_name(KQ_masked, "KQ_masked");

            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
            ggml_set_name(KQ_soft_max, "KQ_soft_max");

+
            // split cached V into n_head heads
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
@ -1294,7 +1332,7 @@ static bool llama_eval_internal(
    lctx.use_buf(ctx0, -1);

    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@ -1332,7 +1370,7 @@ static bool llama_eval_internal(
    }

    // extract embeddings
-    if (lctx.embedding.size()) {
+    if (!lctx.embedding.empty()) {
        auto & embedding_out = lctx.embedding;

        embedding_out.resize(n_embd);
@ -1383,6 +1421,8 @@ struct llama_sp_symbol {
    size_t n;
 };

+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
    struct comparator {
        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@ -1415,7 +1455,7 @@ struct llama_tokenizer {
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
-            symbols_.emplace_back(std::move(sym));
+            symbols_.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
@ -1506,7 +1546,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

-    if (text.size() == 0) {
+    if (text.empty()) {
        return output;
    }

@ -1742,7 +1782,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
    const int64_t t_start_sample_us = ggml_time_us();

    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
        if (token_iter == last_tokens + last_tokens_size) {
            continue;
        }
@ -1886,7 +1926,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
    const int64_t t_start_sample_us = ggml_time_us();

    // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
        return a.logit < b.logit;
    });

@ -1941,7 +1981,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        nthread = std::thread::hardware_concurrency();
    }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                            /*vocab_only*/ false));
    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@ -1995,7 +2035,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            } else if (tensor.type == GGML_TYPE_F16) {
                f32_conv_buf.resize(nelements * sizeof(float));
                f32_data = (float *) f32_conv_buf.addr;
-                auto f16_data = (const ggml_fp16_t *) tensor.data;
+                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
                for (size_t i = 0; i < nelements; i++) {
                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                }
@ -2026,21 +2066,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                        size_t first = counter; counter += chunk_size;
                        if (first >= nelements) {
                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                for (int j=0; j<int(local_hist.size()); ++j) {
+                                    hist_cur[j] += local_hist[j];
+                                }
                                new_size += local_size;
                            }
                            break;
                        }
                        lock.unlock();
                        size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        if (local_hist.empty()) {
+                            local_hist.resize(hist_cur.size(), 0);
+                        }
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
-                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                if ((int) workers.size() < nthread_use - 1) {
+                    workers.resize(nthread_use - 1);
+                }
+                for (int it = 0; it < nthread_use - 1; ++it) {
+                    workers[it] = std::thread(compute);
+                }
                compute();
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+                for (int it = 0; it < nthread_use - 1; ++it) {
+                    workers[it].join();
+                }
            }

            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@ -2112,7 +2162,7 @@ struct llama_context * llama_init_from_file(

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                          params.use_mmap, params.use_mlock, params.vocab_only,
                          params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
@ -2238,7 +2288,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

-        size_t ctx_size, mmapped_size;
+        size_t ctx_size;
+        size_t mmapped_size;
        model_loader->calc_sizes(&ctx_size, &mmapped_size);
        base_buf.resize(ctx_size);

@ -2277,8 +2328,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }

-        std::string name(length, 0);
-        fin.read(&name[0], length);
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }

        // check for lora suffix and get the type of tensor
        const std::string lora_suffix = ".lora";
@ -2293,7 +2348,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
        base_name.erase(pos);
        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+        if (model_tensors.find(base_name) == model_tensors.end()) {
            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
            return 1;
        }
@ -2373,7 +2428,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

            if (scaling != 1.0f) {
                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
            }

            ggml_tensor * r;
@ -2395,10 +2450,11 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            lora_tensors.clear();

            n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                fprintf(stderr, ".");
            }
        }
+    }

    // TODO: this should be in a destructor, it will leak on failure
    ggml_free(lora_ctx);
@ -2425,7 +2481,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    return ctx->model.kv_self.n;
 }

-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)

 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
    if (seed < 0) {
@ -2684,7 +2740,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

-        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }
--- a/llama.h
+++ b/llama.h
@ -56,6 +56,7 @@ extern "C" {
    struct llama_context_params {
        int n_ctx;        // text context
        int n_parts;      // -1 for default
+        int n_gpu_layers; // number of layers to store in VRAM
        int seed;         // RNG seed, -1 for random

        bool f16_kv;     // use fp16 for KV cache
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -10,3 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+# llama_add_test(test-grad0.c) # SLOW
+# llama_add_test(test-opt.c) # SLOW
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@ -0,0 +1,205 @@
+#include "ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define MAX_NARGS 2
+
+
+//
+// logging
+//
+#define GGML_DEBUG 0
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+
+float frand() {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+    return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = min + irand(max-min);
+    }
+}
+
+
+struct ggml_tensor * get_random_tensor(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+    return ((float *)t->data)[idx];
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+    ((float *)t->data)[idx] = value;
+}
+
+int main(int argc, const char ** argv) {
+    struct ggml_init_params params = {
+        .mem_size   = 1024*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    int64_t ne1[4] = {4, 1024, 1, 1};
+    int64_t ne2[4] = {4, 2048, 1, 1};;
+    int64_t ne3[4] = {1024, 2048, 1, 1};
+
+    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
+    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
+    ggml_set_param(ctx, a);
+    ggml_set_param(ctx, b);
+
+    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+
+    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
+    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
+    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
+
+
+    struct ggml_cgraph ge = ggml_build_forward(e);
+    ggml_graph_reset  (&ge);
+    ggml_graph_compute(ctx, &ge);
+    const float fe = ggml_get_f32_1d(e, 0);
+    printf("%s: e = %.4f\n", __func__, fe);
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+
+    ggml_opt(ctx, opt_params, e);
+
+    ggml_graph_reset  (&ge);
+    ggml_graph_compute(ctx, &ge);
+    const float fe_opt = ggml_get_f32_1d(e, 0);
+    printf("%s: original  e = %.4f\n", __func__, fe);
+    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+
+    const bool success = (fe_opt <= fe);
+    assert(success);
+
+    ggml_free(ctx);
+    return success ? 0 : -1;
+}
+// int64_t ne1[4] = {4, 128, 1, 1};
+// int64_t ne2[4] = {4, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 25890.9375
+// main: optimized e = 10094.7031
+
+// int64_t ne1[4] = {8, 128, 1, 1};
+// int64_t ne2[4] = {8, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 39429.5078
+// main: optimized e = 9275.8936
+
+// int64_t ne1[4] = {16, 128, 1, 1};
+// int64_t ne2[4] = {16, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 68371.1328
+// main: optimized e = 7854.4502
+
+
+// int64_t ne1[4] = {32, 128, 1, 1};
+// int64_t ne2[4] = {32, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 126061.1953
+// main: optimized e = 5451.0166
+
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1620817.8750
+// main: optimized e = 698387.6875
+
+// another run on M1
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1629595.6250
+// main: optimized e = 698169.1250
+
+// int64_t ne1[4] = {32, 1024, 1, 1};
+// int64_t ne2[4] = {32, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 8146770.5000
+// main: optimized e = 651119.1250