diff --git a/.gitignore b/.gitignore
index c75e1cb5b..2ee9fb3be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,9 +21,6 @@
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
-models/*
-*.bin
-
 /main
 /quantize
 /quantize-stats
diff --git a/BLIS.md b/BLIS.md
deleted file mode 100644
index 9b3c30605..000000000
--- a/BLIS.md
+++ /dev/null
@@ -1,67 +0,0 @@
-BLIS Installation Manual
-------------------------
-
-BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
-
-Project URL: https://github.com/flame/blis
-
-### Prepare:
-
-Compile BLIS:
-
-```bash
-git clone https://github.com/flame/blis
-cd blis
-./configure --enable-cblas -t openmp,pthreads auto
-# will install to /usr/local/ by default.
-make -j
-```
-
-Install BLIS:
-
-```bash
-sudo make install
-```
-
-We recommend using openmp since it's easier to modify the cores been used.
-
-### llama.cpp compilation
-
-Makefile:
-
-```bash
-make LLAMA_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
-```
-
-CMake:
-
-```bash
-mkdir build
-cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
-make -j
-```
-
-### llama.cpp execution
-
-According to the BLIS documentation, we could set the following
-environment variables to modify the behavior of openmp:
-
-```
-export GOMP_GPU_AFFINITY="0-19"
-export BLIS_NUM_THREADS=14
-```
-
-And then run the binaries as normal.
-
-
-### Intel specific issue
-
-Some might get the error message saying that `libimf.so` cannot be found.
-Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
-
-### Reference:
-
-1. https://github.com/flame/blis#getting-started
-2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
diff --git a/README.md b/README.md
index 231bfa5c9..c423b4898 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ A self contained distributable from Concedo that exposes llama.cpp function bind
 
 What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. In a tiny package around 20 MB in size, excluding model weights.
 
-![Preview](preview.png)
+![Preview](media/preview.png)
 
 ## Usage
 - [Download the latest release here](https://github.com/LostRuins/koboldcpp/releases/latest) or clone the repo.
diff --git a/build.zig b/build.zig
deleted file mode 100644
index 306127ffe..000000000
--- a/build.zig
+++ /dev/null
@@ -1,61 +0,0 @@
-const std = @import("std");
-
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("examples");
-    lib.addCSourceFiles(&.{
-        "ggml.c",
-    }, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{
-        "llama.cpp",
-    }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
-}
diff --git a/media/llama-leader.jpeg b/media/llama-leader.jpeg
deleted file mode 100644
index 0b4e6e1cf..000000000
Binary files a/media/llama-leader.jpeg and /dev/null differ
diff --git a/media/llama0-banner.png b/media/llama0-banner.png
deleted file mode 100644
index cee3a87f1..000000000
Binary files a/media/llama0-banner.png and /dev/null differ
diff --git a/media/llama0-logo.png b/media/llama0-logo.png
deleted file mode 100644
index e55b38bd9..000000000
Binary files a/media/llama0-logo.png and /dev/null differ
diff --git a/media/llama1-banner.png b/media/llama1-banner.png
deleted file mode 100644
index 1e469584e..000000000
Binary files a/media/llama1-banner.png and /dev/null differ
diff --git a/media/llama1-logo.png b/media/llama1-logo.png
deleted file mode 100644
index 365c5b865..000000000
Binary files a/media/llama1-logo.png and /dev/null differ
diff --git a/preview.png b/media/preview.png
similarity index 100%
rename from preview.png
rename to media/preview.png
diff --git a/models/ggml-vocab.bin b/models/ggml-vocab.bin
deleted file mode 100644
index 38f63493a..000000000
Binary files a/models/ggml-vocab.bin and /dev/null differ
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 0458948d2..9e023a9d6 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -325,48 +325,6 @@ ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & mo
 
     fin.close();
 
-// //gpu offload for gpt2
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b);
-//             ggml_v2_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_v2_nbytes(layer.ln_2_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_v2_nbytes(layer.ln_2_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_v2_nbytes(layer.c_attn_attn_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_v2_nbytes(layer.c_attn_attn_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_v2_nbytes(layer.c_attn_proj_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index 9cf78d9fe..b3d9a5666 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -337,50 +337,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     fin.close();
 
-// //gpu offload for gpt2
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-//         SetGPULayers(n_gpu);
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
-//             ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
-//             ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
-//             ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
-//             ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-//             ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index cfb48a8c6..100ca1ace 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -331,46 +331,6 @@ ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & mo
 
     fin.close();
 
-// //gpu offload for gptj
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_q_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_k_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_v_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 893e6ebd6..894e7fb4d 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -331,46 +331,6 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     fin.close();
 
-// //gpu offload for gptj
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-//         SetGPULayers(n_gpu);
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
-//             ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
-//             ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
 
     return ModelLoadResult::SUCCESS;
 
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
deleted file mode 100644
index 03e1d2c04..000000000
--- a/pocs/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-# dependencies
-
-find_package(Threads REQUIRED)
-
-# third-party
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (EMSCRIPTEN)
-else()
-    add_subdirectory(vdot)
-endif()
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt
deleted file mode 100644
index fb89a1cd4..000000000
--- a/pocs/vdot/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-set(TARGET vdot)
-add_executable(${TARGET} vdot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
-set(TARGET q8dot)
-add_executable(${TARGET} q8dot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
deleted file mode 100644
index 5748c8ac2..000000000
--- a/pocs/vdot/q8dot.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cstdint>
-#include <cmath>
-#include <cassert>
-#include <vector>
-#include <random>
-#include <chrono>
-#include <algorithm>
-#include <array>
-
-#include <ggml.h>
-
-constexpr int kVecSize = 1 << 16;
-
-// Copy-pasted from ggml.c
-#define QK4_0 32
-typedef struct {
-    float d;                // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
-    float d;                // delta
-    float m;                // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-// Copy-pasted from ggml.c
-#define QK8_0 32
-typedef struct {
-    float d;           // delta
-    float s;           // d * sum(qs[i])
-    int8_t qs[QK8_0];  // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
-
-static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
-static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
-
-template <typename T>
-void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
-    for (auto& b : blocks) {
-        b.d = 1;
-        for (int i=0; i<QK4_0/2; ++i) {
-            uint8_t v1 = rndm() >> 28;
-            uint8_t v2 = rndm() >> 28;
-            b.qs[i] = v1 | (v2 << 4);
-        }
-    }
-}
-
-void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
-    for (auto& b : blocks) {
-        b.d = 1;
-        int sum = 0;
-        for (int i=0; i<QK8_0; ++i) {
-            b.qs[i] = (rndm() >> 24) - 128;
-            sum += b.qs[i];
-        }
-        b.s = b.d * sum;
-    }
-}
-
-float simpleDot(const block_q4_0& x, const block_q8_0& y) {
-    int s1 = 0; //, s2 = 0;
-    for (int i=0; i<QK4_0/2; i+=2) {
-        int v1 = x.qs[i+0] & 0xf;
-        int v2 = x.qs[i+0] >> 4;
-        int v3 = x.qs[i+1] & 0xf;
-        int v4 = x.qs[i+1] >> 4;
-        int j = 2*i;
-        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
-        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
-    }
-    return y.d * x.d * s1 - 8 * x.d * y.s;
-    //return y.d * x.d * (s1 - 8 * s2);
-}
-
-float simpleDot(const block_q4_1& x, const block_q8_0& y) {
-    int s1 = 0; //, s2 = 0;
-    for (int i=0; i<QK4_1/2; i+=2) {
-        int v1 = x.qs[i+0] & 0xf;
-        int v2 = x.qs[i+0] >> 4;
-        int v3 = x.qs[i+1] & 0xf;
-        int v4 = x.qs[i+1] >> 4;
-        int j = 2*i;
-        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
-        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
-    }
-    return y.d * x.d * s1 + y.s * x.m;
-    //return y.d * (x.d * s1 + x.m * s2);
-}
-
-struct Stat {
-    double sum = 0, sumt = 0, sumt2 = 0, maxt = 0;
-    int nloop = 0;
-    void addResult(double s, double t) {
-        sum += s;
-        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
-        ++nloop;
-    }
-    void reportResult(const char* title) const {
-        if (nloop < 1) {
-            printf("%s(%s): no result\n",__func__,title);
-            return;
-        }
-        printf("============ %s\n",title);
-        printf(" = %g\n",sum/nloop);
-        auto t = sumt/nloop, dt = sumt2/nloop - t*t;
-        if (dt > 0) dt = sqrt(dt);
-        printf("