Merge remote-tracking branch 'upstream/master'

# Conflicts: # examples/server/README.md
2023-06-15 17:12:29 -04:00 · 2023-06-15 17:12:29 -04:00 · 488c62acf9
commit 488c62acf9
parent aee859519e a09f9195be
11 changed files with 547 additions and 457 deletions
--- a/.gitignore
+++ b/.gitignore
@ -32,6 +32,7 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /server
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -163,12 +163,23 @@ if (LLAMA_BLAS)
    if (BLAS_FOUND)
        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_path(BLAS_INCLUDE_DIRS
+            NAMES cblas.h
+            HINTS
+                /usr/include
+                /usr/local/include
+                /usr/include/openblas
+        )
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
        add_compile_options(${BLAS_LINKER_FLAGS})
        add_compile_definitions(GGML_USE_OPENBLAS)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

-        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
-        include_directories(${BLAS_INCLUDE_DIRS})
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -408,7 +419,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_EXTRA}
            )

-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

--- a/7
+++ b/7
@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch

 ifdef LLAMA_BUILD_SERVER
 	BUILD_TARGETS += server
@ -261,7 +261,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h

 #
 # Examples
@ -291,6 +291,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
--- a/Package.swift
+++ b/Package.swift
@ -11,6 +11,7 @@ let package = Package(
        .target(
            name: "llama",
            path: ".",
+            exclude: ["ggml-metal.metal"],
            sources: ["ggml.c", "llama.cpp"],
            publicHeadersPath: "spm-headers",
            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
    if (escape_prompt) {
        process_escapes(params.prompt);
    }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
        }                                                                               \
    } while (0)

-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err)                                                               \
    do {                                                                                \
        cublasStatus_t err_ = (err);                                                    \
@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
 }

 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device > g_device_count) {
+    if (main_device >= g_device_count) {
        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                main_device, g_device_count, g_main_device);
        return;
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

 #ifdef __cplusplus
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -284,528 +284,551 @@ void ggml_metal_get_tensor(

 void ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
-             struct ggml_cgraph * gf) {
+               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);

-    size_t offs_src0 = 0;
-    size_t offs_src1 = 0;
-    size_t offs_dst  = 0;
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel

-    id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBuffer];
-    id<MTLComputeCommandEncoder> encoder = nil;
+    const int n_cb = gf->n_threads;

-    for (int i = 0; i < gf->n_nodes; ++i) {
-        //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];

-        struct ggml_tensor * src0 = gf->nodes[i]->src0;
-        struct ggml_tensor * src1 = gf->nodes[i]->src1;
-        struct ggml_tensor * dst  = gf->nodes[i];
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];

-        const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-        const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-        const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-        const int64_t  ne03 = src0 ? src0->ne[3] : 0;
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }

-        const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-        const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-        const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-        const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);

-        const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-        const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-        const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-        const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;

-        const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-        const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-        const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-        const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+        dispatch_async(queue, ^{
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst  = 0;

-        const int64_t  ne0  = dst ? dst->ne[0] : 0;
-        const int64_t  ne1  = dst ? dst->ne[1] : 0;
-        const int64_t  ne2  = dst ? dst->ne[2] : 0;
-        const int64_t  ne3  = dst ? dst->ne[3] : 0;
+            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];

-        const uint64_t nb0  = dst ? dst->nb[0] : 0;
-        const uint64_t nb1  = dst ? dst->nb[1] : 0;
-        const uint64_t nb2  = dst ? dst->nb[2] : 0;
-        const uint64_t nb3  = dst ? dst->nb[3] : 0;
+            id<MTLComputeCommandEncoder> encoder = nil;

-        const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-        const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-        const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;

-        id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+            for (int i = node_start; i < node_end; ++i) {
+                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

-        //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-        //if (src0) {
-        //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-        //            ggml_is_contiguous(src0), src0->name);
-        //}
-        //if (src1) {
-        //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-        //            ggml_is_contiguous(src1), src1->name);
-        //}
-        //if (dst) {
-        //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-        //            dst->name);
-        //}
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * dst  = gf->nodes[i];

-        switch (dst->op) {
-            case GGML_OP_RESHAPE:
-            case GGML_OP_VIEW:
-            case GGML_OP_TRANSPOSE:
-            case GGML_OP_PERMUTE:
-                {
-                    // noop
-                } break;
-            case GGML_OP_ADD:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t  ne03 = src0 ? src0->ne[3] : 0;

-                    [encoder setComputePipelineState:ctx->pipeline_add];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;

-                    const int64_t n = ggml_nelements(dst);
+                const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_MUL:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);

-                    if (ggml_nelements(src1) == ne10) {
-                        // src1 is a row
-                        [encoder setComputePipelineState:ctx->pipeline_mul_row];
-                    } else {
-                        [encoder setComputePipelineState:ctx->pipeline_mul];
-                    }
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                const int64_t  ne0  = dst ? dst->ne[0] : 0;
+                const int64_t  ne1  = dst ? dst->ne[1] : 0;
+                const int64_t  ne2  = dst ? dst->ne[2] : 0;
+                const int64_t  ne3  = dst ? dst->ne[3] : 0;

-                    const int64_t n = ggml_nelements(dst);
+                const uint64_t nb0  = dst ? dst->nb[0] : 0;
+                const uint64_t nb1  = dst ? dst->nb[1] : 0;
+                const uint64_t nb2  = dst ? dst->nb[2] : 0;
+                const uint64_t nb3  = dst ? dst->nb[3] : 0;

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_SCALE:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

-                    const float scale = *(const float *) src1->data;
+                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;

-                    [encoder setComputePipelineState:ctx->pipeline_scale];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+                //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+                //if (src0) {
+                //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+                //            ggml_is_contiguous(src0), src0->name);
+                //}
+                //if (src1) {
+                //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+                //            ggml_is_contiguous(src1), src1->name);
+                //}
+                //if (dst) {
+                //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+                //            dst->name);
+                //}

-                    const int64_t n = ggml_nelements(dst);
+                switch (dst->op) {
+                    case GGML_OP_RESHAPE:
+                    case GGML_OP_VIEW:
+                    case GGML_OP_TRANSPOSE:
+                    case GGML_OP_PERMUTE:
+                        {
+                            // noop
+                        } break;
+                    case GGML_OP_ADD:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_SILU:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                            [encoder setComputePipelineState:ctx->pipeline_add];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

-                    [encoder setComputePipelineState:ctx->pipeline_silu];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            const int64_t n = ggml_nelements(dst);

-                    const int64_t n = ggml_nelements(dst);
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_MUL:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_RELU:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                            if (ggml_nelements(src1) == ne10) {
+                                // src1 is a row
+                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_mul];
+                            }
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];

-                    [encoder setComputePipelineState:ctx->pipeline_relu];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            const int64_t n = ggml_nelements(dst);

-                    const int64_t n = ggml_nelements(dst);
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_SCALE:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_GELU:
-            {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                            const float scale = *(const float *) src1->data;

-                    [encoder setComputePipelineState:ctx->pipeline_gelu];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

-                    const int64_t n = ggml_nelements(dst);
+                            const int64_t n = ggml_nelements(dst);

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-            } break;
-            case GGML_OP_SOFT_MAX:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_SILU:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    const int nth = 32;
+                            [encoder setComputePipelineState:ctx->pipeline_silu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                            const int64_t n = ggml_nelements(dst);

-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                } break;
-            case GGML_OP_DIAG_MASK_INF:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_RELU:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    const int n_past = ((int32_t *)(src1->data))[0];
+                            [encoder setComputePipelineState:ctx->pipeline_relu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                    [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                    [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+                            const int64_t n = ggml_nelements(dst);

-                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_GELU:
+                    {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                    GGML_ASSERT(ne00 == ne10);
-                    GGML_ASSERT(ne02 == ne12);
+                            [encoder setComputePipelineState:ctx->pipeline_gelu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                    if (ggml_is_contiguous(src0) &&
-                        ggml_is_contiguous(src1) &&
-                        (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
+                            const int64_t n = ggml_nelements(dst);

-                        if (encoder != nil) {
-                            [encoder endEncoding];
-                            encoder = nil;
-                        }
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                    case GGML_OP_SOFT_MAX:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                        MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-                        MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+                            const int nth = 32;

-                        // for F32 x F32 we use MPS
-                        MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                            matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
+                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

-                        MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                            matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    case GGML_OP_DIAG_MASK_INF:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }

-                        MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
-                            matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
+                            const int n_past = ((int32_t *)(src1->data))[0];

-                        MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
-                            initWithDevice:ctx->device transposeLeft:false transposeRight:true
-                                resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
+                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];

-                        // we need to do ne02 multiplications
-                        // TODO: is there a way to do this in parallel - currently very slow ..
-                        // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                        for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                            size_t offs_src0_cur = offs_src0 + i02*nb02;
-                            size_t offs_src1_cur = offs_src1 + i02*nb12;
-                            size_t offs_dst_cur  = offs_dst  + i02*nb2;
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_MUL_MAT:
+                        {
+                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

-                            MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
-                            MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
-                            MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
+                            GGML_ASSERT(ne00 == ne10);
+                            GGML_ASSERT(ne02 == ne12);

-                            [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-                        }
-                    } else {
-                        if (encoder == nil) {
-                            encoder = [command_buffer computeCommandEncoder];
-                        }
+                            if (ggml_is_contiguous(src0) &&
+                                ggml_is_contiguous(src1) &&
+                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {

-                        int nth0 = 32;
-                        int nth1 = 1;
-
-                        // use custom matrix x vector kernel
-                        switch (src0t) {
-                            case GGML_TYPE_F16:
-                                {
-                                    GGML_ASSERT(ne02 == ne12);
-
-                                    nth0 = 64;
-                                    nth1 = 1;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
-                                } break;
-                            case GGML_TYPE_Q4_0:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 8;
-                                    nth1 = 8;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
-                                } break;
-                            case GGML_TYPE_Q4_1:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 8;
-                                    nth1 = 8;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
-                                } break;
-                            case GGML_TYPE_Q2_K:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 4;
-                                    nth1 = 16;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
-                                } break;
-                            case GGML_TYPE_Q3_K:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 4;
-                                    nth1 = 16;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
-                                } break;
-                            case GGML_TYPE_Q4_K:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 4;
-                                    nth1 = 16;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
-                                } break;
-                            case GGML_TYPE_Q5_K:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 4;
-                                    nth1 = 16;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
-                                } break;
-                            case GGML_TYPE_Q6_K:
-                                {
-                                    GGML_ASSERT(ne02 == 1);
-                                    GGML_ASSERT(ne12 == 1);
-
-                                    nth0 = 4;
-                                    nth1 = 16;
-                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
-                                } break;
-                            default:
-                                {
-                                    fprintf(stderr, "Asserting on type %d\n",(int)src0t);
-                                    GGML_ASSERT(false && "not implemented");
+                                if (encoder != nil) {
+                                    [encoder endEncoding];
+                                    encoder = nil;
                                }
-                        };

+                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;

-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                                // for F32 x F32 we use MPS
+                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];

-                        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                        }
-                        else if (src0t == GGML_TYPE_Q2_K ||
-                                 src0t == GGML_TYPE_Q3_K ||
-                                 src0t == GGML_TYPE_Q4_K ||
-                                 src0t == GGML_TYPE_Q5_K ||
-                                 src0t == GGML_TYPE_Q6_K) {
-                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                        } else {
-                            [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                        }
-                    }
-                } break;
-            case GGML_OP_GET_ROWS:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];

-                    switch (src0->type) {
-                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
-                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
-                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
-                        default: GGML_ASSERT(false && "not implemented");
-                    }
+                                MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];

-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                    [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
-                    [encoder setBytes:&(dst->nb[1])  length:sizeof(uint64_t) atIndex:5];
+                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
+                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
+                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

-                    const int64_t n = ggml_nelements(src1);
+                                // we need to do ne02 multiplications
+                                // TODO: is there a way to do this in parallel - currently very slow ..
+                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
+                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
+                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
+                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
+                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;

-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_RMS_NORM:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
+                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
+                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];

-                    const float eps = 1e-6f;
+                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
+                                }
+                            } else {
+                                if (encoder == nil) {
+                                    encoder = [command_buffer computeCommandEncoder];
+                                }

-                    const int nth = 256;
+                                int nth0 = 32;
+                                int nth1 = 1;

-                    [encoder setComputePipelineState:ctx->pipeline_rms_norm];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
-                    [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                                // use custom matrix x vector kernel
+                                switch (src0t) {
+                                    case GGML_TYPE_F16:
+                                        {
+                                            GGML_ASSERT(ne02 == ne12);

-                    const int64_t nrows = ggml_nrows(src0);
+                                            nth0 = 64;
+                                            nth1 = 1;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_0:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                } break;
-            case GGML_OP_ROPE:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_1:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    const int n_dims = ((int32_t *) src1->data)[1];
-                    const int mode   = ((int32_t *) src1->data)[2];
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
+                                        } break;
+                                    case GGML_TYPE_Q2_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    const int n_past = ((int32_t *)(src1->data))[0];
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q3_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    [encoder setComputePipelineState:ctx->pipeline_rope];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
-                    [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
-                    [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
-                    [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
-                    [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
-                    [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
-                    [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
-                    [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
-                    [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
-                    [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
-                    [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
-                    [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
-                    [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
-                    [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
-                    [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
-                    [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
-                    [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
-                    [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_CPY:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q5_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    const int nth = 32;
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q6_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);

-                    switch (src0t) {
-                        case GGML_TYPE_F32:
-                            {
-                                switch (dstt) {
-                                    case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
-                                    case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
-                                    default: GGML_ASSERT(false && "not implemented");
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+                                        } break;
+                                    default:
+                                        {
+                                            fprintf(stderr, "Asserting on type %d\n",(int)src0t);
+                                            GGML_ASSERT(false && "not implemented");
+                                        }
                                };
-                            } break;
-                        default: GGML_ASSERT(false && "not implemented");
-                    }

-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                    [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                    [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                    [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                    [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                    [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                    [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                    [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                    [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                    [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                    [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                    [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                    [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
+                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];

-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                } break;
-            default:
-                fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                GGML_ASSERT(false);
-        }
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q2_K ||
+                                         src0t == GGML_TYPE_Q3_K ||
+                                         src0t == GGML_TYPE_Q4_K ||
+                                         src0t == GGML_TYPE_Q5_K ||
+                                         src0t == GGML_TYPE_Q6_K) {
+                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                } else {
+                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                            }
+                        } break;
+                    case GGML_OP_GET_ROWS:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            switch (src0->type) {
+                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
+                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
+                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
+                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
+                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
+                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
+                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
+                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            }
+
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
+                            [encoder setBytes:&(dst->nb[1])  length:sizeof(uint64_t) atIndex:5];
+
+                            const int64_t n = ggml_nelements(src1);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_RMS_NORM:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const float eps = 1e-6f;
+
+                            const int nth = 256;
+
+                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                            const int64_t nrows = ggml_nrows(src0);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    case GGML_OP_ROPE:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int n_dims = ((int32_t *) src1->data)[1];
+                            const int mode   = ((int32_t *) src1->data)[2];
+
+                            const int n_past = ((int32_t *)(src1->data))[0];
+
+                            [encoder setComputePipelineState:ctx->pipeline_rope];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_CPY:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int nth = 32;
+
+                            switch (src0t) {
+                                case GGML_TYPE_F32:
+                                    {
+                                        switch (dstt) {
+                                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
+                                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
+                                            default: GGML_ASSERT(false && "not implemented");
+                                        };
+                                    } break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            }
+
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    default:
+                        fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                        GGML_ASSERT(false);
+                }
+            }
+
+            if (encoder != nil) {
+                [encoder endEncoding];
+                encoder = nil;
+            }
+
+            [command_buffer commit];
+        });
    }

-    if (encoder != nil) {
-        [encoder endEncoding];
-        encoder = nil;
-    }
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});

-    [command_buffer commit];
-    [command_buffer waitUntilCompleted];
-
-    {
-        const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-        UNUSED(time_elapsed);
-
-        metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-    }
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
--- a/llama.h
+++ b/llama.h
@ -244,9 +244,9 @@ extern "C" {
    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();   // next-line

    // Sampling functions

--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@ -0,0 +1 @@
+../ggml.h