diff --git a/.gitignore b/.gitignore index fe902e6f5..30f3d8e66 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,7 @@ __pycache__ .swiftpm dist/ -*.spec \ No newline at end of file +*.spec + +zig-out/ +zig-cache/ diff --git a/Package.swift b/Package.swift index 79d13c82d..2c2c147ba 100644 --- a/Package.swift +++ b/Package.swift @@ -13,7 +13,10 @@ let package = Package( path: ".", sources: ["ggml.c", "llama.cpp"], publicHeadersPath: "spm-headers", - cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])] + cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")], + linkerSettings: [ + .linkedFramework("Accelerate") + ] ), ], cxxLanguageStandard: .cxx11 diff --git a/build.zig b/build.zig new file mode 100644 index 000000000..b73460880 --- /dev/null +++ b/build.zig @@ -0,0 +1,62 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const lib = b.addStaticLibrary(.{ + .name = "llama", + .target = target, + .optimize = optimize, + }); + lib.linkLibCpp(); + lib.addIncludePath("."); + lib.addIncludePath("examples"); + lib.addCSourceFiles(&.{ + "ggml.c", + }, &.{"-std=c11"}); + lib.addCSourceFiles(&.{ + "llama.cpp", + "examples/common.cpp", + }, &.{"-std=c++11"}); + lib.install(); + + const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize }; + const exe = build_example("main", build_args); + _ = build_example("quantize", build_args); + _ = build_example("perplexity", build_args); + _ = build_example("embedding", build_args); + + // create "zig build run" command for ./main + + const run_cmd = exe.run(); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| { + run_cmd.addArgs(args); + } + + const run_step = b.step("run", "Run the app"); + run_step.dependOn(&run_cmd.step); +} + +fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { + const b = args.b; + const lib = args.lib; + const target = args.target; + const optimize = args.optimize; + + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath("."); + exe.addIncludePath("examples"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}), + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + exe.install(); + + return exe; +} diff --git a/examples/Miku.sh b/examples/Miku.sh new file mode 100755 index 000000000..352478a15 --- /dev/null +++ b/examples/Miku.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -e + +AI_NAME="${AI_NAME:-Miku}" +MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}" +USER_NAME="${USER_NAME:-Anon}" + +# Uncomment and adjust to the number of CPU cores you want to use. +#N_THREAD="${N_THREAD:-4}" +N_PREDICTS="${N_PREDICTS:-4096}" + +GEN_OPTIONS=(--batch_size 1024 +--ctx_size 2048 +--keep -1 +--repeat_last_n 256 +--repeat_penalty 1.17647 +--temp 0.7 +--top_k 40 +--top_p 0.5) + +if [ -n "$N_THREAD" ]; then + GEN_OPTIONS+=(--threads "$N_THREAD") +fi + +./main "${GEN_OPTIONS[@]}" \ + --model "$MODEL" \ + --n_predict "$N_PREDICTS" \ + --color --interactive \ + --reverse-prompt "${USER_NAME}:" \ + --prompt " +This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer. +${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. 
She uses this to reason about the world and to think about what she should say next. +${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between ${USER_NAME} and ${AI_NAME} +The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice. +${AI_NAME} can only communicate through text, so she can't send images or videos. + + +${USER_NAME}: Hello! +${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that! +${AI_NAME}: What do you like to do in your free time? ^_^ +${USER_NAME}:" "$@" diff --git a/ggml.c b/ggml.c index 59e84ab45..8a60bc383 100644 --- a/ggml.c +++ b/ggml.c @@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.pad =*/ { 0 }, }; - ggml_assert_aligned(result->data); + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + + result->nb[0] = src->nb[0]; + result->nb[1] = src->nb[1]; + result->nb[2] = src->nb[2]; + result->nb[3] = src->nb[3]; + + return result; } //////////////////////////////////////////////////////////////////////////////// @@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d( return result; } +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + if (a->grad) { + GGML_ASSERT(false); // gradient propagation is not supported + } + + const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? 
+ + return result; +} + // ggml_permute struct ggml_tensor * ggml_permute( @@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); } } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; + } + return; + } - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const 
ggml_fp16_t *) src0_ptr); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } + GGML_ASSERT(false); // TODO: implement } } @@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00*nb00; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, rs); + memcpy(dst_ptr, src0_ptr, sizeof(float)); - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] 
= GGML_FP32_TO_FP16(*src0_ptr); - id++; + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } + GGML_ASSERT(false); // TODO: implement } } @@ -7199,7 +7238,6 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7226,11 +7264,28 @@ static void ggml_compute_forward_rope_f32( assert(nb0 == sizeof(float)); - // TODO: optimize + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -7256,7 +7311,6 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7283,10 +7337,28 @@ static void ggml_compute_forward_rope_f16( assert(nb0 == sizeof(ggml_fp16_t)); + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -9385,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ROPE: { - node->n_tasks = 1; + node->n_tasks = n_threads; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: diff --git a/ggml.h b/ggml.h index ad962b109..3c94efc35 100644 --- a/ggml.h +++ b/ggml.h @@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, // row stride in bytes size_t offset); +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/koboldcpp.dll b/koboldcpp.dll index 25ba23ba7..6ba731ac6 100644 Binary files a/koboldcpp.dll and b/koboldcpp.dll differ diff --git a/koboldcpp_blas.dll b/koboldcpp_blas.dll index f62ae8eac..a6905decd 100644 Binary files a/koboldcpp_blas.dll and b/koboldcpp_blas.dll differ diff --git a/llama.cpp b/llama.cpp index 854bb8993..581a8399d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -810,37 +810,35 @@ static bool llama_eval_internal( // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + { + // compute the transposed [N, n_embd] V matrix + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + struct ggml_tensor * k = 
ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, - ggml_rope(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - n_past, n_rot, 0), + Qcur, 0, 2, 1, 3); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ggml_tensor * K = ggml_permute(ctx0, - ggml_rope(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), - n_embd/n_head, n_head, n_past + N), - n_past, n_rot, 1), + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q @@ -858,18 +856,23 @@ static bool llama_eval_internal( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(kv_self.v), + n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, + il*n_ctx*ggml_element_size(kv_self.v)*n_embd); - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? 
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); @@ -955,9 +958,13 @@ static bool llama_eval_internal( ggml_build_forward_expand(&gf, inpL); ggml_graph_compute (ctx0, &gf); + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + //ggml_graph_print(&gf); + + // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); //} //embd_w.resize(n_vocab*N); @@ -1229,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k( } } - sample_top_k(logits_id, top_k); + if (top_k > 0 && top_k < n_logits) { + sample_top_k(logits_id, top_k); + } float maxl = -std::numeric_limits::infinity(); for (const auto & kv : logits_id) { diff --git a/media/llama-leader.jpeg b/media/llama-leader.jpeg new file mode 100644 index 000000000..0b4e6e1cf Binary files /dev/null and b/media/llama-leader.jpeg differ diff --git a/media/llama0-banner.png b/media/llama0-banner.png new file mode 100644 index 000000000..cee3a87f1 Binary files /dev/null and b/media/llama0-banner.png differ diff --git a/media/llama0-logo.png b/media/llama0-logo.png new file mode 100644 index 000000000..e55b38bd9 Binary files /dev/null and b/media/llama0-logo.png differ diff --git a/media/llama1-banner.png b/media/llama1-banner.png new file mode 100644 index 000000000..1e469584e Binary files /dev/null and b/media/llama1-banner.png differ diff --git a/media/llama1-logo.png b/media/llama1-logo.png new file mode 100644 index 000000000..365c5b865 Binary files /dev/null and b/media/llama1-logo.png differ diff --git a/quantize.exe b/quantize.exe index 72ae9a009..37d2b8a61 100644 Binary files a/quantize.exe and b/quantize.exe differ
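
The V-cache change above stores V transposed in kv_self.v so that the attention matmul can read it through a strided ggml_view_3d() instead of materializing the old V_trans copy: an element (i0, i1, i2) of such a view lives at data + offset + i0*nb0 + i1*nb1 + i2*nb2. The stand-alone C sketch below only illustrates that addressing scheme with a plain buffer; the toy sizes N_CTX/N_EMBD/N_HEAD and the demo program itself are illustrative assumptions, not code from the patch.

/*
 * Sketch (assumption, not part of the patch): how a strided 3-D view over
 * the transposed V cache addresses elements without copying the data.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define N_CTX  8   /* toy context size          */
#define N_EMBD 6   /* toy embedding size        */
#define N_HEAD 2   /* toy number of heads       */

int main(void) {
    const int head_dim = N_EMBD / N_HEAD;

    /* one layer of the transposed V cache: n_embd rows of n_ctx floats,
     * element [e][t] = value of embedding dim e at token position t */
    float * v = malloc(sizeof(float) * N_EMBD * N_CTX);
    for (int e = 0; e < N_EMBD; e++)
        for (int t = 0; t < N_CTX; t++)
            v[e*N_CTX + t] = 100.0f*e + t;

    /* byte strides of the 3-D view [n_past + N, head_dim, n_head],
     * mirroring the ggml_view_3d() call in llama_eval_internal() */
    const size_t nb0 = sizeof(float);                  /* token position  */
    const size_t nb1 = N_CTX*sizeof(float);            /* dim within head */
    const size_t nb2 = N_CTX*sizeof(float)*head_dim;   /* head index      */

    /* element (i0 = token, i1 = dim in head, i2 = head) of the view */
    const int i0 = 3, i1 = 1, i2 = 1;
    const char * base = (const char *) v;
    const float got = *(const float *)(base + i0*nb0 + i1*nb1 + i2*nb2);

    /* the same element addressed directly in the flat buffer */
    const float want = v[(i2*head_dim + i1)*N_CTX + i0];

    printf("view: %.1f  direct: %.1f\n", got, want);
    assert(got == want);

    free(v);
    return 0;
}

Keeping V non-contiguous trades the cost of the old explicit copy for strided reads inside the matmul, which is why the patch leaves the disabled #else branch (copying V into a contiguous tensor before the multiply) in place for comparison.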