Merge branch 'master' into eval-thread-count

Author: ml6
Date:   2023-04-05 12:44:50 -07:00
Commit: 4778f93611
14 changed files with 406 additions and 177 deletions

.gitignore (vendored, 3 changes)

@@ -33,3 +33,6 @@ compile_commands.json
 .venv
 __pycache__
 .swiftpm
+zig-out/
+zig-cache/

Makefile

@@ -72,6 +72,7 @@ endif
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
 	CFLAGS   += -march=native -mtune=native
+	CXXFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)

Package.swift

@@ -13,7 +13,10 @@ let package = Package(
             path: ".",
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
-            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
+            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+            linkerSettings: [
+                .linkedFramework("Accelerate")
+            ]
         ),
     ],
     cxxLanguageStandard: .cxx11
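The GGML_USE_ACCELERATE define only has an effect where the C sources gate their vDSP/BLAS usage on it. A minimal sketch of that guard pattern, with a hypothetical helper name rather than the real ggml functions:

#include <stdio.h>
#include <stddef.h>

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// hypothetical helper (not a ggml function): scale n floats in place
static void vec_scale_f32(float * x, size_t n, float s) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(x, 1, &s, x, 1, n);   // vectorized path from the Accelerate framework
#else
    for (size_t i = 0; i < n; i++) { // portable fallback
        x[i] *= s;
    }
#endif
}

int main(void) {
    float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    vec_scale_f32(v, 4, 0.5f);
    printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);  // 0.5 1 1.5 2
    return 0;
}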

README.md

@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
+![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -9,8 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Hot topics:**

-- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
-- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

 ## Description
@@ -28,20 +27,31 @@ Please do not make conclusions about the models based on the results from this i
 For all I know, it can be completely wrong. This project is for educational purposes.
 New features will probably be added mostly through community contributions.

-Supported platforms:
+**Supported platforms:**

 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
 - [X] Docker

-Supported models:
+**Supported models:**

 - [X] LLaMA 🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+
+**Bindings:**
+
+- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+
+**UI:**
+
+- [nat/openplayground](https://github.com/nat/openplayground)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)

 ---
@@ -145,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 make

+# For Windows and CMake, use the following command instead:
+cd <path_to_llama_folder>
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@@ -367,3 +384,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
 - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+
+### Docs
+
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

build.zig (new file, 62 lines)

@@ -0,0 +1,62 @@
const std = @import("std");

pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
    const lib = b.addStaticLibrary(.{
        .name = "llama",
        .target = target,
        .optimize = optimize,
    });
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");
    lib.addCSourceFiles(&.{
        "ggml.c",
    }, &.{"-std=c11"});
    lib.addCSourceFiles(&.{
        "llama.cpp",
        "examples/common.cpp",
    }, &.{"-std=c++11"});
    lib.install();

    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
    const exe = build_example("main", build_args);
    _ = build_example("quantize", build_args);
    _ = build_example("perplexity", build_args);
    _ = build_example("embedding", build_args);

    // create "zig build run" command for ./main
    const run_cmd = exe.run();
    run_cmd.step.dependOn(b.getInstallStep());
    if (b.args) |args| {
        run_cmd.addArgs(args);
    }

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);
}

fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
    const b = args.b;
    const lib = args.lib;
    const target = args.target;
    const optimize = args.optimize;

    const exe = b.addExecutable(.{
        .name = name,
        .target = target,
        .optimize = optimize,
    });
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{
        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
    }, &.{"-std=c++11"});
    exe.linkLibrary(lib);
    exe.install();

    return exe;
}

examples/Miku.sh (new executable file, 49 lines)

@@ -0,0 +1,49 @@
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

ggml.c (350 changes)

@@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.pad  =*/ { 0 },
     };

-    ggml_assert_aligned(result->data);
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //ggml_assert_aligned(result->data);

     for (int i = 0; i < n_dims; i++) {
         result->ne[i] = ne[i];
@@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
-    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+    result->nb[0] = src->nb[0];
+    result->nb[1] = src->nb[1];
+    result->nb[2] = src->nb[2];
+    result->nb[3] = src->nb[3];
+
+    return result;
 }

 ////////////////////////////////////////////////////////////////////////////////
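Copying the nb strides matters for views of non-contiguous tensors (for example, a transposed one): ggml_new_tensor_impl derives contiguous strides from ne, which point at the wrong bytes once the source has been permuted. A small arithmetic sketch with hypothetical sizes: a contiguous 4 x 3 F32 tensor has nb = {4, 16}; ggml_transpose swaps ne and nb, giving ne = {3, 4} and nb = {16, 4}.

#include <stdio.h>

int main(void) {
    const size_t nb_kept[2]       = { 16, 4 };  // strides copied from the transposed source (new behaviour)
    const size_t nb_recomputed[2] = { 4, 12 };  // contiguous strides derived from ne = {3, 4} (old behaviour)

    // byte offset of logical element (i0 = 1, i1 = 2) in the view
    printf("with copied strides:     %zu\n", 1*nb_kept[0]       + 2*nb_kept[1]);        // 24
    printf("with recomputed strides: %zu\n", 1*nb_recomputed[0] + 2*nb_recomputed[1]);  // 28
    return 0;
}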
@@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
     return result;
 }

+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                offset) {
+    if (a->grad) {
+        GGML_ASSERT(false); // gradient propagation is not supported
+    }
+
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    result->op   = GGML_OP_VIEW;
+    result->grad = NULL;
+    result->src0 = a;
+    result->src1 = NULL; // TODO: maybe store the offset here?
+
+    return result;
+}
+
 // ggml_permute

 struct ggml_tensor * ggml_permute(
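ggml_view_3d only builds a new tensor header over existing data: the caller supplies the row stride (nb1), the slice stride (nb2) and a byte offset, and no elements are copied. A minimal usage sketch, assuming it is compiled and linked against ggml.h/ggml.c from this commit:

/* Take the second half of every row of an 8 x 4 x 2 F32 tensor without copying. */
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);

    // same row stride (nb1) and slice stride (nb2) as 'a', offset of 4 floats into each row
    struct ggml_tensor * v = ggml_view_3d(ctx, a, 4, 4, 2, a->nb[1], a->nb[2], 4*sizeof(float));

    printf("view: ne = %d x %d x %d, nb = %zu/%zu/%zu bytes\n",
           (int) v->ne[0], (int) v->ne[1], (int) v->ne[2], v->nb[0], v->nb[1], v->nb[2]);

    ggml_free(ctx);
    return 0;
}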
@@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];

-    if (ggml_is_contiguous(src0) && src0->type == dst->type) {
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }

-    if (src0->nb[0] == sizeof(ggml_fp16_t)) {
-        if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            const size_t rs = ne00*nb00;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        char * dst_ptr = (char *) dst->data + id*rs;
-
-                        memcpy(dst_ptr, src0_ptr, rs);
-
-                        id++;
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    } else {
-        //printf("%s: this is not optimal - fix me\n", __func__);
-
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = *src0_ptr;
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0] &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
     }
 }
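The rewritten kernel walks the destination with four counters (i10..i13) that wrap like an odometer, so source and destination can use completely different strides. The pattern, isolated on plain C arrays with hypothetical shapes (not ggml code):

/* Copy a 3 x 2 source with padded rows (stride of 4 floats) into a tightly packed destination. */
#include <stdio.h>
#include <string.h>

int main(void) {
    const float  src[2][4] = { { 1, 2, 3, -1 }, { 4, 5, 6, -1 } };
    const size_t ne0 = 3, ne1 = 2;              // logical extents
    const size_t snb0 = sizeof(float);          // source element stride, in bytes
    const size_t snb1 = 4*sizeof(float);        // source row stride, in bytes (padded)

    float dst[6] = { 0 };
    const size_t dnb0 = sizeof(float);          // destination is tightly packed
    const size_t dnb1 = ne0*sizeof(float);

    size_t i10 = 0, i11 = 0;                    // destination counters (the odometer)
    for (size_t i01 = 0; i01 < ne1; i01++) {
        for (size_t i00 = 0; i00 < ne0; i00++) {
            const char * sp = (const char *) src + i00*snb0 + i01*snb1;
                  char * dp = (char *)       dst + i10*dnb0 + i11*dnb1;
            memcpy(dp, sp, sizeof(float));

            if (++i10 == ne0) {                 // carry into the next destination row
                i10 = 0;
                ++i11;
            }
        }
    }

    for (size_t i = 0; i < 6; i++) printf("%g ", dst[i]);  // 1 2 3 4 5 6
    printf("\n");
    return 0;
}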
@@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];

-    if (ggml_is_contiguous(src0) && src0->type == dst->type) {
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }

-    if (src0->nb[0] == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            const size_t rs = ne00*nb00;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        char * dst_ptr = (char *) dst->data + id*rs;
-
-                        memcpy(dst_ptr, src0_ptr, rs);
-                        id++;
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    } else {
-        //printf("%s: this is not optimal - fix me\n", __func__);
-
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = *src0_ptr;
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
     }
 }
@@ -7199,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);
@@ -7226,11 +7264,28 @@
     assert(nb0 == sizeof(float));

-    // TODO: optimize
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -7256,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);
@@ -7283,10 +7337,28 @@
     assert(nb0 == sizeof(ggml_fp16_t));

+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float theta = powf(10000.0, ((float)-i0)/n_dims);
@@ -9385,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     } break;
                 case GGML_OP_ROPE:
                     {
-                        node->n_tasks = 1;
+                        node->n_tasks = n_threads;
                     } break;
                 case GGML_OP_CONV_1D_1S:
                 case GGML_OP_CONV_1D_2S:
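With n_tasks set to n_threads, each RoPE worker claims a contiguous block of rows via the dr/ir0/ir1 arithmetic above and skips everything outside its half-open range. The same arithmetic on hypothetical values (10 rows across 4 threads):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int nr  = 10;  // total rows (ggml_nrows(src0))
    const int nth = 4;   // worker threads (params->nth)

    const int dr = (nr + nth - 1)/nth;  // rows per thread, rounded up

    for (int ith = 0; ith < nth; ith++) {
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr);
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;  // prints [0,3) [3,6) [6,9) [9,10)
}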

ggml.h (10 changes)

@@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d(
         size_t                nb1, // row stride in bytes
         size_t                offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row   stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,

llama.cpp

@@ -812,37 +812,35 @@
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -860,18 +858,23 @@
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
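Because V is now stored transposed in the cache, each embedding dimension occupies one cache row of length n_ctx, and ggml_view_3d can slice the cached V per head with plain stride arithmetic instead of the old permute-and-copy. The offsets, worked out on deliberately small hypothetical dimensions (not the real model sizes):

#include <stdio.h>

int main(void) {
    const size_t esize  = 4;   // ggml_element_size(kv_self.v), assuming an F32 cache
    const size_t n_embd = 8, n_head = 2, n_ctx = 16;
    const size_t n_past = 3, N = 2, il = 1;

    // where this layer's new V columns are written (the ggml_view_2d above)
    size_t v_write_off = il*n_ctx*esize*n_embd + n_past*esize;
    size_t v_row_nb    = n_ctx*esize;                 // next embedding dimension within the cache

    // how attention reads it back (the ggml_view_3d above), ne = {n_past + N, n_embd/n_head, n_head}
    size_t v_nb1 = n_ctx*esize;                       // next embedding dimension within a head
    size_t v_nb2 = n_ctx*esize*n_embd/n_head;         // next head
    size_t v_off = il*n_ctx*esize*n_embd;             // this layer's slab

    printf("write: offset %zu bytes, row stride %zu bytes\n", v_write_off, v_row_nb);
    printf("read:  ne0 %zu, nb1 %zu, nb2 %zu, offset %zu bytes\n",
           (size_t)(n_past + N), v_nb1, v_nb2, v_off);
    return 0;
}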
@@ -957,9 +960,13 @@
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);

+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

     //embd_w.resize(n_vocab*N);
@@ -1231,7 +1238,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

     float maxl = -std::numeric_limits<float>::infinity();
     for (const auto & kv : logits_id) {
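The guard skips top-k truncation when top_k is 0 or not smaller than the number of candidates. A standalone sketch of that behaviour (hypothetical helper, not the llama.cpp implementation):

#include <stdio.h>
#include <stdlib.h>

static int cmp_desc(const void * a, const void * b) {
    float fa = *(const float *) a, fb = *(const float *) b;
    return (fa < fb) - (fa > fb);           // sort descending
}

// returns how many entries at the front of 'logits' remain candidates
static size_t keep_top_k(float * logits, size_t n, int top_k) {
    if (top_k <= 0 || (size_t) top_k >= n) {
        return n;                           // guard: nothing to truncate
    }
    qsort(logits, n, sizeof(float), cmp_desc);
    return (size_t) top_k;
}

int main(void) {
    float logits[] = { 0.1f, 2.5f, -1.0f, 3.0f, 0.7f };
    size_t kept = keep_top_k(logits, 5, 3);
    for (size_t i = 0; i < kept; i++) printf("%g ", logits[i]);  // 3 2.5 0.7
    printf("\n");
    return 0;
}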

BIN  media/llama-leader.jpeg (new file, 195 KiB)
BIN  media/llama0-banner.png (new file, 141 KiB)
BIN  media/llama0-logo.png   (new file, 176 KiB)
BIN  media/llama1-banner.png (new file, 32 KiB)
BIN  media/llama1-logo.png   (new file, 32 KiB)