Merge branch 'master' into concedo
# Conflicts: # .gitignore # Makefile # README.md
This commit is contained in:
commit
0e889ed6db
15 changed files with 381 additions and 173 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -35,4 +35,7 @@ __pycache__
|
||||||
.swiftpm
|
.swiftpm
|
||||||
|
|
||||||
dist/
|
dist/
|
||||||
*.spec
|
*.spec
|
||||||
|
|
||||||
|
zig-out/
|
||||||
|
zig-cache/
|
||||||
|
|
|
@ -13,7 +13,10 @@ let package = Package(
|
||||||
path: ".",
|
path: ".",
|
||||||
sources: ["ggml.c", "llama.cpp"],
|
sources: ["ggml.c", "llama.cpp"],
|
||||||
publicHeadersPath: "spm-headers",
|
publicHeadersPath: "spm-headers",
|
||||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
|
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
|
||||||
|
linkerSettings: [
|
||||||
|
.linkedFramework("Accelerate")
|
||||||
|
]
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
cxxLanguageStandard: .cxx11
|
cxxLanguageStandard: .cxx11
|
||||||
|
|
62
build.zig
Normal file
62
build.zig
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
const std = @import("std");
|
||||||
|
|
||||||
|
pub fn build(b: *std.Build) void {
|
||||||
|
const target = b.standardTargetOptions(.{});
|
||||||
|
const optimize = b.standardOptimizeOption(.{});
|
||||||
|
|
||||||
|
const lib = b.addStaticLibrary(.{
|
||||||
|
.name = "llama",
|
||||||
|
.target = target,
|
||||||
|
.optimize = optimize,
|
||||||
|
});
|
||||||
|
lib.linkLibCpp();
|
||||||
|
lib.addIncludePath(".");
|
||||||
|
lib.addIncludePath("examples");
|
||||||
|
lib.addCSourceFiles(&.{
|
||||||
|
"ggml.c",
|
||||||
|
}, &.{"-std=c11"});
|
||||||
|
lib.addCSourceFiles(&.{
|
||||||
|
"llama.cpp",
|
||||||
|
"examples/common.cpp",
|
||||||
|
}, &.{"-std=c++11"});
|
||||||
|
lib.install();
|
||||||
|
|
||||||
|
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
|
||||||
|
const exe = build_example("main", build_args);
|
||||||
|
_ = build_example("quantize", build_args);
|
||||||
|
_ = build_example("perplexity", build_args);
|
||||||
|
_ = build_example("embedding", build_args);
|
||||||
|
|
||||||
|
// create "zig build run" command for ./main
|
||||||
|
|
||||||
|
const run_cmd = exe.run();
|
||||||
|
run_cmd.step.dependOn(b.getInstallStep());
|
||||||
|
if (b.args) |args| {
|
||||||
|
run_cmd.addArgs(args);
|
||||||
|
}
|
||||||
|
|
||||||
|
const run_step = b.step("run", "Run the app");
|
||||||
|
run_step.dependOn(&run_cmd.step);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
|
||||||
|
const b = args.b;
|
||||||
|
const lib = args.lib;
|
||||||
|
const target = args.target;
|
||||||
|
const optimize = args.optimize;
|
||||||
|
|
||||||
|
const exe = b.addExecutable(.{
|
||||||
|
.name = name,
|
||||||
|
.target = target,
|
||||||
|
.optimize = optimize,
|
||||||
|
});
|
||||||
|
exe.addIncludePath(".");
|
||||||
|
exe.addIncludePath("examples");
|
||||||
|
exe.addCSourceFiles(&.{
|
||||||
|
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
|
||||||
|
}, &.{"-std=c++11"});
|
||||||
|
exe.linkLibrary(lib);
|
||||||
|
exe.install();
|
||||||
|
|
||||||
|
return exe;
|
||||||
|
}
|
49
examples/Miku.sh
Executable file
49
examples/Miku.sh
Executable file
|
@ -0,0 +1,49 @@
|
||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
AI_NAME="${AI_NAME:-Miku}"
|
||||||
|
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
|
||||||
|
USER_NAME="${USER_NAME:-Anon}"
|
||||||
|
|
||||||
|
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||||
|
#N_THREAD="${N_THREAD:-4}"
|
||||||
|
N_PREDICTS="${N_PREDICTS:-4096}"
|
||||||
|
|
||||||
|
GEN_OPTIONS=(--batch_size 1024
|
||||||
|
--ctx_size 2048
|
||||||
|
--keep -1
|
||||||
|
--repeat_last_n 256
|
||||||
|
--repeat_penalty 1.17647
|
||||||
|
--temp 0.7
|
||||||
|
--top_k 40
|
||||||
|
--top_p 0.5)
|
||||||
|
|
||||||
|
if [ -n "$N_THREAD" ]; then
|
||||||
|
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||||
|
fi
|
||||||
|
|
||||||
|
./main "${GEN_OPTIONS[@]}" \
|
||||||
|
--model "$MODEL" \
|
||||||
|
--n_predict "$N_PREDICTS" \
|
||||||
|
--color --interactive \
|
||||||
|
--reverse-prompt "${USER_NAME}:" \
|
||||||
|
--prompt "
|
||||||
|
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
|
||||||
|
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
|
||||||
|
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
|
||||||
|
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
|
||||||
|
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
|
||||||
|
The conversation is only between ${USER_NAME} and ${AI_NAME}
|
||||||
|
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
|
||||||
|
${AI_NAME} can only communicate through text, so she can't send images or videos.
|
||||||
|
|
||||||
|
|
||||||
|
${USER_NAME}: Hello!
|
||||||
|
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
|
||||||
|
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
|
||||||
|
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
|
||||||
|
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
|
||||||
|
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
|
||||||
|
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
|
||||||
|
${AI_NAME}: What do you like to do in your free time? ^_^
|
||||||
|
${USER_NAME}:" "$@"
|
350
ggml.c
350
ggml.c
|
@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
/*.pad =*/ { 0 },
|
/*.pad =*/ { 0 },
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_assert_aligned(result->data);
|
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
|
||||||
|
//ggml_assert_aligned(result->data);
|
||||||
|
|
||||||
for (int i = 0; i < n_dims; i++) {
|
for (int i = 0; i < n_dims; i++) {
|
||||||
result->ne[i] = ne[i];
|
result->ne[i] = ne[i];
|
||||||
|
@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
|
||||||
struct ggml_tensor * ggml_view_tensor(
|
struct ggml_tensor * ggml_view_tensor(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
const struct ggml_tensor * src) {
|
const struct ggml_tensor * src) {
|
||||||
return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
||||||
|
|
||||||
|
result->nb[0] = src->nb[0];
|
||||||
|
result->nb[1] = src->nb[1];
|
||||||
|
result->nb[2] = src->nb[2];
|
||||||
|
result->nb[3] = src->nb[3];
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_view_3d
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_view_3d(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int64_t ne0,
|
||||||
|
int64_t ne1,
|
||||||
|
int64_t ne2,
|
||||||
|
size_t nb1,
|
||||||
|
size_t nb2,
|
||||||
|
size_t offset) {
|
||||||
|
if (a->grad) {
|
||||||
|
GGML_ASSERT(false); // gradient propagation is not supported
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
||||||
|
|
||||||
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
||||||
|
|
||||||
|
result->nb[1] = nb1;
|
||||||
|
result->nb[2] = nb2;
|
||||||
|
result->nb[3] = result->nb[2]*ne2;
|
||||||
|
|
||||||
|
result->op = GGML_OP_VIEW;
|
||||||
|
result->grad = NULL;
|
||||||
|
result->src0 = a;
|
||||||
|
result->src1 = NULL; // TODO: maybe store the offset here?
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_permute
|
// ggml_permute
|
||||||
|
|
||||||
struct ggml_tensor * ggml_permute(
|
struct ggml_tensor * ggml_permute(
|
||||||
|
@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(params->ith == 0);
|
GGML_ASSERT(params->ith == 0);
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
||||||
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
|
@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16(
|
||||||
const size_t nb02 = src0->nb[2];
|
const size_t nb02 = src0->nb[2];
|
||||||
const size_t nb03 = src0->nb[3];
|
const size_t nb03 = src0->nb[3];
|
||||||
|
|
||||||
if (ggml_is_contiguous(src0) && src0->type == dst->type) {
|
const size_t nb0 = dst->nb[0];
|
||||||
|
const size_t nb1 = dst->nb[1];
|
||||||
|
const size_t nb2 = dst->nb[2];
|
||||||
|
const size_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
|
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
||||||
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src0->nb[0] == sizeof(ggml_fp16_t)) {
|
if (src0->type == dst->type &&
|
||||||
if (dst->type == GGML_TYPE_F16) {
|
src0->ne[0] == dst->ne[0] &&
|
||||||
size_t id = 0;
|
src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
|
||||||
const size_t rs = ne00*nb00;
|
// copy by rows
|
||||||
|
const size_t rs = ne00*nb00;
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
memcpy(
|
||||||
char * dst_ptr = (char *) dst->data + id*rs;
|
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
|
||||||
|
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
|
||||||
memcpy(dst_ptr, src0_ptr, rs);
|
rs);
|
||||||
|
|
||||||
id++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (dst->type == GGML_TYPE_F32) {
|
}
|
||||||
size_t id = 0;
|
return;
|
||||||
float * dst_ptr = (float *) dst->data;
|
}
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
// TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
||||||
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
||||||
|
|
||||||
dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
|
// dst counters
|
||||||
id++;
|
int64_t i10 = 0;
|
||||||
|
int64_t i11 = 0;
|
||||||
|
int64_t i12 = 0;
|
||||||
|
int64_t i13 = 0;
|
||||||
|
|
||||||
|
if (dst->type == GGML_TYPE_F16) {
|
||||||
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||||
|
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||||
|
|
||||||
|
memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
|
||||||
|
|
||||||
|
if (++i10 == ne00) {
|
||||||
|
i10 = 0;
|
||||||
|
if (++i11 == ne01) {
|
||||||
|
i11 = 0;
|
||||||
|
if (++i12 == ne02) {
|
||||||
|
i12 = 0;
|
||||||
|
if (++i13 == ne03) {
|
||||||
|
i13 = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (dst->type == GGML_TYPE_F32) {
|
||||||
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||||
|
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||||
|
|
||||||
|
*(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
|
||||||
|
|
||||||
|
if (++i10 == ne00) {
|
||||||
|
i10 = 0;
|
||||||
|
if (++i11 == ne01) {
|
||||||
|
i11 = 0;
|
||||||
|
if (++i12 == ne02) {
|
||||||
|
i12 = 0;
|
||||||
|
if (++i13 == ne03) {
|
||||||
|
i13 = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false); // TODO: implement
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//printf("%s: this is not optimal - fix me\n", __func__);
|
GGML_ASSERT(false); // TODO: implement
|
||||||
|
|
||||||
if (dst->type == GGML_TYPE_F32) {
|
|
||||||
size_t id = 0;
|
|
||||||
float * dst_ptr = (float *) dst->data;
|
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
||||||
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
||||||
|
|
||||||
dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
|
|
||||||
id++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (dst->type == GGML_TYPE_F16) {
|
|
||||||
size_t id = 0;
|
|
||||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
||||||
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
||||||
|
|
||||||
dst_ptr[id] = *src0_ptr;
|
|
||||||
id++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false); // TODO: implement
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(params->ith == 0);
|
GGML_ASSERT(params->ith == 0);
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
||||||
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
|
@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32(
|
||||||
const size_t nb02 = src0->nb[2];
|
const size_t nb02 = src0->nb[2];
|
||||||
const size_t nb03 = src0->nb[3];
|
const size_t nb03 = src0->nb[3];
|
||||||
|
|
||||||
if (ggml_is_contiguous(src0) && src0->type == dst->type) {
|
const size_t nb0 = dst->nb[0];
|
||||||
|
const size_t nb1 = dst->nb[1];
|
||||||
|
const size_t nb2 = dst->nb[2];
|
||||||
|
const size_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
|
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
|
||||||
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src0->nb[0] == sizeof(float)) {
|
// dst counters
|
||||||
if (dst->type == GGML_TYPE_F32) {
|
int64_t i10 = 0;
|
||||||
size_t id = 0;
|
int64_t i11 = 0;
|
||||||
const size_t rs = ne00*nb00;
|
int64_t i12 = 0;
|
||||||
|
int64_t i13 = 0;
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
if (dst->type == GGML_TYPE_F32) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
char * dst_ptr = (char *) dst->data + id*rs;
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||||
|
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||||
|
|
||||||
memcpy(dst_ptr, src0_ptr, rs);
|
memcpy(dst_ptr, src0_ptr, sizeof(float));
|
||||||
|
|
||||||
id++;
|
if (++i10 == dst->ne[0]) {
|
||||||
}
|
i10 = 0;
|
||||||
}
|
if (++i11 == dst->ne[1]) {
|
||||||
}
|
i11 = 0;
|
||||||
} else if (dst->type == GGML_TYPE_F16) {
|
if (++i12 == dst->ne[2]) {
|
||||||
size_t id = 0;
|
i12 = 0;
|
||||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
if (++i13 == dst->ne[3]) {
|
||||||
|
i13 = 0;
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
}
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
}
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
}
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
}
|
||||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
}
|
||||||
|
}
|
||||||
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
}
|
||||||
id++;
|
}
|
||||||
|
} else if (dst->type == GGML_TYPE_F16) {
|
||||||
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||||
|
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||||
|
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||||
|
|
||||||
|
*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
|
||||||
|
|
||||||
|
if (++i10 == dst->ne[0]) {
|
||||||
|
i10 = 0;
|
||||||
|
if (++i11 == dst->ne[1]) {
|
||||||
|
i11 = 0;
|
||||||
|
if (++i12 == dst->ne[2]) {
|
||||||
|
i12 = 0;
|
||||||
|
if (++i13 == dst->ne[3]) {
|
||||||
|
i13 = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false); // TODO: implement
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//printf("%s: this is not optimal - fix me\n", __func__);
|
GGML_ASSERT(false); // TODO: implement
|
||||||
|
|
||||||
if (dst->type == GGML_TYPE_F32) {
|
|
||||||
size_t id = 0;
|
|
||||||
float * dst_ptr = (float *) dst->data;
|
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
||||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
||||||
|
|
||||||
dst_ptr[id] = *src0_ptr;
|
|
||||||
id++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (dst->type == GGML_TYPE_F16) {
|
|
||||||
size_t id = 0;
|
|
||||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
||||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
||||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
|
||||||
|
|
||||||
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
|
||||||
id++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false); // TODO: implement
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7199,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
|
||||||
assert(src1->type == GGML_TYPE_I32);
|
assert(src1->type == GGML_TYPE_I32);
|
||||||
assert(ggml_nelements(src1) == 3);
|
assert(ggml_nelements(src1) == 3);
|
||||||
|
|
||||||
|
@ -7226,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(
|
||||||
|
|
||||||
assert(nb0 == sizeof(float));
|
assert(nb0 == sizeof(float));
|
||||||
|
|
||||||
// TODO: optimize
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const int nr = ggml_nrows(src0);
|
||||||
|
|
||||||
|
// rows per thread
|
||||||
|
const int dr = (nr + nth - 1)/nth;
|
||||||
|
|
||||||
|
// row range for this thread
|
||||||
|
const int ir0 = dr*ith;
|
||||||
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
// row index used to determine which thread to use
|
||||||
|
int ir = 0;
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
|
if (ir++ < ir0) continue;
|
||||||
|
if (ir > ir1) break;
|
||||||
|
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
||||||
|
|
||||||
|
@ -7256,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
assert(params->ith == 0);
|
|
||||||
assert(src1->type == GGML_TYPE_I32);
|
assert(src1->type == GGML_TYPE_I32);
|
||||||
assert(ggml_nelements(src1) == 3);
|
assert(ggml_nelements(src1) == 3);
|
||||||
|
|
||||||
|
@ -7283,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
|
||||||
|
|
||||||
assert(nb0 == sizeof(ggml_fp16_t));
|
assert(nb0 == sizeof(ggml_fp16_t));
|
||||||
|
|
||||||
|
const int ith = params->ith;
|
||||||
|
const int nth = params->nth;
|
||||||
|
|
||||||
|
const int nr = ggml_nrows(src0);
|
||||||
|
|
||||||
|
// rows per thread
|
||||||
|
const int dr = (nr + nth - 1)/nth;
|
||||||
|
|
||||||
|
// row range for this thread
|
||||||
|
const int ir0 = dr*ith;
|
||||||
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
|
// row index used to determine which thread to use
|
||||||
|
int ir = 0;
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
|
if (ir++ < ir0) continue;
|
||||||
|
if (ir > ir1) break;
|
||||||
|
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
||||||
|
|
||||||
|
@ -9385,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
{
|
{
|
||||||
node->n_tasks = 1;
|
node->n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_1D_1S:
|
case GGML_OP_CONV_1D_1S:
|
||||||
case GGML_OP_CONV_1D_2S:
|
case GGML_OP_CONV_1D_2S:
|
||||||
|
|
10
ggml.h
10
ggml.h
|
@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d(
|
||||||
size_t nb1, // row stride in bytes
|
size_t nb1, // row stride in bytes
|
||||||
size_t offset);
|
size_t offset);
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_view_3d(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int64_t ne0,
|
||||||
|
int64_t ne1,
|
||||||
|
int64_t ne2,
|
||||||
|
size_t nb1, // row stride in bytes
|
||||||
|
size_t nb2, // slice stride in bytes
|
||||||
|
size_t offset);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_permute(
|
struct ggml_tensor * ggml_permute(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
|
BIN
koboldcpp.dll
BIN
koboldcpp.dll
Binary file not shown.
Binary file not shown.
73
llama.cpp
73
llama.cpp
|
@ -810,37 +810,35 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
// compute Q and K and RoPE them
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||||
|
|
||||||
// store key and value to memory
|
// store key and value to memory
|
||||||
if (N >= 1) {
|
{
|
||||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
// compute the transposed [N, n_embd] V matrix
|
||||||
struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
|
||||||
|
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||||
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
|
||||||
|
( n_ctx)*ggml_element_size(kv_self.v),
|
||||||
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
|
||||||
|
|
||||||
|
// important: storing RoPE-ed version of K in the KV cache!
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
|
||||||
struct ggml_tensor * Q =
|
struct ggml_tensor * Q =
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_rope(ctx0,
|
Qcur,
|
||||||
ggml_cpy(ctx0,
|
|
||||||
Qcur,
|
|
||||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
|
||||||
n_past, n_rot, 0),
|
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
|
||||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
|
||||||
struct ggml_tensor * K =
|
struct ggml_tensor * K =
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_rope(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
||||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
n_embd/n_head, n_head, n_past + N),
|
||||||
n_embd/n_head, n_head, n_past + N),
|
|
||||||
n_past, n_rot, 1),
|
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
|
||||||
// K * Q
|
// K * Q
|
||||||
|
@ -858,18 +856,23 @@ static bool llama_eval_internal(
|
||||||
// KQ = soft_max(KQ_masked)
|
// KQ = soft_max(KQ_masked)
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||||
|
|
||||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
// split cached V into n_head heads
|
||||||
struct ggml_tensor * V_trans =
|
struct ggml_tensor * V =
|
||||||
ggml_cpy(ctx0,
|
ggml_view_3d(ctx0, kv_self.v,
|
||||||
ggml_permute(ctx0,
|
n_past + N, n_embd/n_head, n_head,
|
||||||
ggml_reshape_3d(ctx0,
|
n_ctx*ggml_element_size(kv_self.v),
|
||||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
|
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
||||||
n_embd/n_head, n_head, n_past + N),
|
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
||||||
1, 2, 0, 3),
|
|
||||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
|
||||||
|
|
||||||
// KQV = transpose(V) * KQ_soft_max
|
#if 1
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
|
#else
|
||||||
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
||||||
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
||||||
|
// is there a better way?
|
||||||
|
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
||||||
|
#endif
|
||||||
|
|
||||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
@ -955,9 +958,13 @@ static bool llama_eval_internal(
|
||||||
ggml_build_forward_expand(&gf, inpL);
|
ggml_build_forward_expand(&gf, inpL);
|
||||||
ggml_graph_compute (ctx0, &gf);
|
ggml_graph_compute (ctx0, &gf);
|
||||||
|
|
||||||
|
// print timing information per ggml operation (for debugging purposes)
|
||||||
|
// requires GGML_PERF to be defined
|
||||||
|
//ggml_graph_print(&gf);
|
||||||
|
|
||||||
|
// plot the computation graph in dot format (for debugging purposes)
|
||||||
//if (n_past%100 == 0) {
|
//if (n_past%100 == 0) {
|
||||||
// ggml_graph_print (&gf);
|
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
|
||||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//embd_w.resize(n_vocab*N);
|
//embd_w.resize(n_vocab*N);
|
||||||
|
@ -1229,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sample_top_k(logits_id, top_k);
|
if (top_k > 0 && top_k < n_logits) {
|
||||||
|
sample_top_k(logits_id, top_k);
|
||||||
|
}
|
||||||
|
|
||||||
float maxl = -std::numeric_limits<float>::infinity();
|
float maxl = -std::numeric_limits<float>::infinity();
|
||||||
for (const auto & kv : logits_id) {
|
for (const auto & kv : logits_id) {
|
||||||
|
|
BIN
media/llama-leader.jpeg
Normal file
BIN
media/llama-leader.jpeg
Normal file
Binary file not shown.
After Width: | Height: | Size: 195 KiB |
BIN
media/llama0-banner.png
Normal file
BIN
media/llama0-banner.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 141 KiB |
BIN
media/llama0-logo.png
Normal file
BIN
media/llama0-logo.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 176 KiB |
BIN
media/llama1-banner.png
Normal file
BIN
media/llama1-banner.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 32 KiB |
BIN
media/llama1-logo.png
Normal file
BIN
media/llama1-logo.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 32 KiB |
BIN
quantize.exe
BIN
quantize.exe
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue