diff --git a/.gitignore b/.gitignore index 1c75d38d1..d8dd34fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ models/* /main /quantize +/quantize-stats /result /perplexity /embedding @@ -33,3 +34,6 @@ compile_commands.json .venv __pycache__ .swiftpm + +zig-out/ +zig-cache/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a434f07b..6bec1f97b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ if (LLAMA_OPENBLAS) add_compile_definitions(GGML_USE_OPENBLAS) add_link_options(${BLAS_LIBRARIES}) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas) else() message(WARNING "OpenBLAS not found") endif() @@ -139,6 +140,7 @@ if (LLAMA_ALL_WARNINGS) -Wpedantic -Wcast-qual -Wno-unused-function + -Wno-multichar ) else() # todo : msvc @@ -151,6 +153,10 @@ if (LLAMA_ALL_WARNINGS) endif() +if (MSVC) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() + if (LLAMA_LTO) include(CheckIPOSupported) check_ipo_supported(RESULT result OUTPUT output) @@ -240,7 +246,9 @@ endif() add_library(llama llama.cpp - llama.h) + llama.h + llama_internal.h + llama_util.h) target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump diff --git a/Makefile b/Makefile index 2f828bf10..3e58a28a7 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ LDFLAGS = # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function -CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific # TODO: support Windows @@ -72,6 +72,7 @@ endif ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) # Use all CPU extensions that are available: CFLAGS += -march=native -mtune=native + CXXFLAGS += -march=native -mtune=native endif ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) @@ -141,14 +142,14 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h +llama.o: llama.cpp llama.h llama_util.h llama_internal.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize perplexity embedding + rm -vf *.o main quantize quantize-stats perplexity embedding main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -159,12 +160,17 @@ main: examples/main/main.cpp ggml.o llama.o common.o quantize: examples/quantize/quantize.cpp ggml.o llama.o $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS) + perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) +libllama.so: llama.o ggml.o + $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) # # Tests # diff --git a/Package.swift b/Package.swift index 79d13c82d..2c2c147ba 100644 --- a/Package.swift +++ b/Package.swift @@ -13,7 +13,10 @@ let package = Package( path: ".", sources: ["ggml.c", "llama.cpp"], publicHeadersPath: "spm-headers", - cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])] + cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")], + linkerSettings: [ + .linkedFramework("Accelerate") + ] ), ], cxxLanguageStandard: .cxx11 diff --git a/README.md b/README.md index 508d315d5..5ef4318eb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llama.cpp -![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png) +![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) @@ -9,8 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** -- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457) -- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) +- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784) ## Description @@ -28,20 +27,31 @@ Please do not make conclusions about the models based on the results from this i For all I know, it can be completely wrong. This project is for educational purposes. New features will probably be added mostly through community contributions. -Supported platforms: +**Supported platforms:** - [X] Mac OS - [X] Linux - [X] Windows (via CMake) - [X] Docker -Supported models: +**Supported models:** - [X] LLaMA 🦙 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) +- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) + +**Bindings:** + +- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) +- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) + +**UI:** + +- [nat/openplayground](https://github.com/nat/openplayground) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) --- @@ -145,6 +155,13 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp make +#For Windows and CMake, use the following command instead: +cd +mkdir build +cd build +cmake .. +cmake --build . --config Release + # obtain the original LLaMA model weights and place them in ./models ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model @@ -333,20 +350,22 @@ We have two Docker images available for this project: The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image. +Replace `/path/to/models` below with the actual path where you downloaded the models. + ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B ``` On complete, you are ready to play! ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with light image: ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` ### Contributing @@ -367,3 +386,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +### Docs + +- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) diff --git a/build.zig b/build.zig new file mode 100644 index 000000000..defc2c3ad --- /dev/null +++ b/build.zig @@ -0,0 +1,67 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + const want_lto = b.option(bool, "lto", "Want -fLTO"); + + const lib = b.addStaticLibrary(.{ + .name = "llama", + .target = target, + .optimize = optimize, + }); + lib.want_lto = want_lto; + lib.linkLibCpp(); + lib.addIncludePath("."); + lib.addIncludePath("examples"); + lib.addCSourceFiles(&.{ + "ggml.c", + }, &.{"-std=c11"}); + lib.addCSourceFiles(&.{ + "llama.cpp", + }, &.{"-std=c++11"}); + lib.install(); + + const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto }; + + const exe = build_example("main", build_args); + _ = build_example("quantize", build_args); + _ = build_example("perplexity", build_args); + _ = build_example("embedding", build_args); + + // create "zig build run" command for ./main + + const run_cmd = exe.run(); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| { + run_cmd.addArgs(args); + } + + const run_step = b.step("run", "Run the app"); + run_step.dependOn(&run_cmd.step); +} + +fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { + const b = args.b; + const lib = args.lib; + const target = args.target; + const optimize = args.optimize; + const want_lto = args.want_lto; + + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.want_lto = want_lto; + exe.addIncludePath("."); + exe.addIncludePath("examples"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}), + "examples/common.cpp", + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + exe.install(); + + return exe; +} diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ce3a34710..67a7cea54 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -31,6 +31,7 @@ if (EMSCRIPTEN) else() add_subdirectory(main) add_subdirectory(quantize) + add_subdirectory(quantize-stats) add_subdirectory(perplexity) add_subdirectory(embedding) endif() diff --git a/examples/Miku.sh b/examples/Miku.sh new file mode 100755 index 000000000..352478a15 --- /dev/null +++ b/examples/Miku.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -e + +AI_NAME="${AI_NAME:-Miku}" +MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}" +USER_NAME="${USER_NAME:-Anon}" + +# Uncomment and adjust to the number of CPU cores you want to use. +#N_THREAD="${N_THREAD:-4}" +N_PREDICTS="${N_PREDICTS:-4096}" + +GEN_OPTIONS=(--batch_size 1024 +--ctx_size 2048 +--keep -1 +--repeat_last_n 256 +--repeat_penalty 1.17647 +--temp 0.7 +--top_k 40 +--top_p 0.5) + +if [ -n "$N_THREAD" ]; then + GEN_OPTIONS+=(--threads "$N_THREAD") +fi + +./main "${GEN_OPTIONS[@]}" \ + --model "$MODEL" \ + --n_predict "$N_PREDICTS" \ + --color --interactive \ + --reverse-prompt "${USER_NAME}:" \ + --prompt " +This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer. +${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between ${USER_NAME} and ${AI_NAME} +The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice. +${AI_NAME} can only communicate through text, so she can't send images or videos. + + +${USER_NAME}: Hello! +${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that! +${AI_NAME}: What do you like to do in your free time? ^_^ +${USER_NAME}:" "$@" diff --git a/examples/common.cpp b/examples/common.cpp index 5400f6b01..f909eed24 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,7 +1,5 @@ #include "common.h" -#include "ggml.h" - #include #include #include @@ -16,12 +14,19 @@ #endif #if defined (_WIN32) +#include +#include #pragma comment(lib,"kernel32.lib") extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); +extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, + const wchar_t * lpWideCharStr, int cchWideChar, + char * lpMultiByteStr, int cbMultiByte, + const char * lpDefaultChar, bool * lpUsedDefaultChar); +#define CP_UTF8 65001 #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { @@ -154,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_color = true; } else if (arg == "--mlock") { params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; } else if (arg == "--mtest") { params.mem_test = true; } else if (arg == "--verbose-prompt") { @@ -233,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - if (ggml_mlock_supported()) { + if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); @@ -307,12 +317,20 @@ void win32_console_init(bool enable_color) { SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) } // Set console output codepage to UTF8 - SetConsoleOutputCP(65001); // CP_UTF8 + SetConsoleOutputCP(CP_UTF8); } void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10) if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) { - // Set console input codepage to UTF8 - SetConsoleCP(65001); // CP_UTF8 + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); } } + +// Convert a wide Unicode string to an UTF8 string +void win32_utf8_encode(const std::wstring & wstr, std::string & str) { + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); + std::string strTo(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); + str = strTo; +} #endif diff --git a/examples/common.h b/examples/common.h index 1505aa927..1ea6f7445 100644 --- a/examples/common.h +++ b/examples/common.h @@ -47,6 +47,7 @@ struct gpt_params { bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt + bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool verbose_prompt = false; // print prompt tokens before generation @@ -92,4 +93,5 @@ void set_console_color(console_state & con_st, console_color_t color); #if defined (_WIN32) void win32_console_init(bool enable_color); +void win32_utf8_encode(const std::wstring & wstr, std::string & str); #endif diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index d397f35fd..2eda3ac01 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -38,6 +38,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 453450a41..d333d0dba 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -97,6 +97,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -386,10 +387,19 @@ int main(int argc, char ** argv) { std::string line; bool another_line = true; do { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // input stream is bad or EOF received + return 0; + } + win32_utf8_encode(wline, line); +#else if (!std::getline(std::cin, line)) { // input stream is bad or EOF received return 0; } +#endif if (line.empty() || line.back() != '\\') { another_line = false; } else { @@ -431,7 +441,7 @@ int main(int argc, char ** argv) { } // end of text token - if (embd.back() == llama_token_eos()) { + if (!embd.empty() && embd.back() == llama_token_eos()) { if (params.instruct) { is_interacting = true; } else { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 07ed0a829..b62f00d0c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -115,6 +115,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt new file mode 100644 index 000000000..7bebc11a1 --- /dev/null +++ b/examples/quantize-stats/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize-stats) +add_executable(${TARGET} quantize-stats.cpp) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp new file mode 100644 index 000000000..203bfe8cc --- /dev/null +++ b/examples/quantize-stats/quantize-stats.cpp @@ -0,0 +1,354 @@ +#include "ggml.h" +#include "llama.h" +#include "llama_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; +static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); + +struct quantize_stats_params { + std::string model = "models/7B/ggml-model-f16.bin"; + bool verbose = false; + bool per_layer_stats = false; + bool print_histogram = false; + bool reference = false; + std::vector include_layers; + std::vector exclude_layers; + std::vector include_types; +}; + +const int64_t SCRATCH_ELEMENTS = 32*32; +const size_t HISTOGRAM_BUCKETS = 150; +const double HISTOGRAM_RANGE = 0.03; + +struct error_stats { + size_t num_samples; + double total_error; + double max_error; + uint64_t error_histogram[HISTOGRAM_BUCKETS]; +}; + + +void quantize_stats_print_usage(int /*argc*/, char ** argv) { + quantize_stats_params params; + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -r, --reference\n"); + fprintf(stderr, " use reference implementation (default: false)\n"); + fprintf(stderr, " -v, --verbose\n"); + fprintf(stderr, " verbose output (default: false)\n"); + fprintf(stderr, " -p, --per-layer-stats\n"); + fprintf(stderr, " print stats per layer (default: false)\n"); + fprintf(stderr, " --histogram\n"); + fprintf(stderr, " print error histogram (default: false)\n"); + fprintf(stderr, " -l LAYER, --include-layer LAYER\n"); + fprintf(stderr, " only test layers matching pattern\n"); + fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n"); + fprintf(stderr, " exclude layers matching pattern\n"); + fprintf(stderr, " -t TYPE, --type TYPE\n"); + fprintf(stderr, " only test given type (q4_0, q4_1)\n"); + fprintf(stderr, "\n"); +} + +// Check if a layer is included/excluded by command line +bool layer_included(const quantize_stats_params params, const std::string & layer) { + for (const auto& excluded : params.exclude_layers) { + if (std::regex_search(layer, std::regex(excluded))) { + return false; + } + } + for (const auto& included : params.include_layers) { + if (std::regex_search(layer, std::regex(included))) { + return true; + } + } + return params.include_layers.empty(); +} + +// Update error statistics given vectors with the before/after result of quantization +void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { + for (int64_t i = 0; i < nelements; i++) { + double diff = input[i] - output[i]; + stats.total_error += diff * diff; + stats.max_error = fmax(fabs(diff), stats.max_error); + stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++; + } + stats.num_samples += nelements; +} + +double find_quantile(const error_stats & stats, double quantile) { + double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0); + + double accum = 0; + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + accum += stats.error_histogram[i]; + if (accum >= sum*quantile) { + return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + } + } + return INFINITY; +} + +void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { + double rmse = sqrt(stats.total_error / (double) stats.num_samples); + double median = find_quantile(stats, .5); + double pct95 = find_quantile(stats, .95); + printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); + if (print_histogram) { + printf("Error distribution:\n"); + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; + printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]); + } + } +} + +// copied from ggml.h - verify that we can access this as a flat array +static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +// Run quantization function for a single layer and update error stats +void test_roundtrip_on_layer( + std::string & name, + bool print_layer_stats, + const quantize_fns_t & qfns, + bool use_reference, + const ggml_tensor * layer, + float * input_scratch, + char *quantized_scratch, + float * output_scratch, + error_stats & total_error) { + + assert(tensor_is_contiguous(layer)); + error_stats layer_error {}; + int64_t nelements = ggml_nelements(layer); + + for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) { + int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset); + + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); + } else { + qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + } + qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, total_error); + if (print_layer_stats) { + update_error_stats(chunk_size, input_scratch, output_scratch, layer_error); + } + } + if (print_layer_stats) { + print_error_stats(name, layer_error, false); + } +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + quantize_stats_params params; + + // read command line + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-h" || arg == "--help") { + quantize_stats_print_usage(argc, argv); + exit(0); + } else if (arg == "-r" || arg == "--reference") { + params.reference = true; + } else if (arg == "-v") { + params.verbose = true; + } else if (arg == "-p" || arg == "--per-layer-stats") { + params.per_layer_stats = true; + } else if (arg == "--histogram") { + params.print_histogram = true; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-l" || arg == "--include-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_layers.push_back(argv[i]); + } else if (arg == "-L" || arg == "--exclude-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.exclude_layers.push_back(argv[i]); + } else if (arg == "-t" || arg == "--type") { + if (++i >= argc) { + invalid_param = true; + break; + } + int j; + for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) { + // find match + } + if (j < GGML_TYPE_COUNT) { + params.include_types.push_back((ggml_type) j); + } else { + fprintf(stderr, "error: %s not in list of types\n", argv[i]); + invalid_param = true; + } + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + + // load the model + fprintf(stderr, "Loading model\n"); + + const int64_t t_main_start_us = ggml_time_us(); + llama_context * ctx; + + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = 256; + lparams.n_parts = 1; + lparams.seed = 1; + lparams.f16_kv = false; + lparams.use_mlock = false; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + const auto &tensors = llama_internal_get_tensor_map(ctx); + + // check layer tensors + int included_layers = 0; + int64_t max_nelements = 0; + bool is_f16 = false; + for (const auto& kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second)); + } + if (kv_tensor.second->type == GGML_TYPE_F16) { + is_f16 = true; + } else if (kv_tensor.second->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: error: Quantization should be tested with a float model, " + "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); + llama_free(ctx); + return 1; + } + included_layers++; + max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second)); + } + + if (is_f16) { + printf("note: source model is f16\n"); + } + printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); + // allocate scratch space + std::vector input_scratch(SCRATCH_ELEMENTS); + std::vector quantized_scratch(SCRATCH_ELEMENTS*4); + std::vector output_scratch(SCRATCH_ELEMENTS); + + // loop throught quantization types + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { + continue; + } + quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + if (qfns.quantize_row_q && qfns.dequantize_row_q) { + if (params.verbose) { + printf("testing %s ...\n", type_strs[i]); + } + + error_stats global_stats {}; + + for (const auto& kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf(" %s ...\n", kv_tensor.first.c_str()); + } + std::string layer_name { type_strs[i] }; + layer_name += "::" + kv_tensor.first; + test_roundtrip_on_layer( + layer_name, + params.per_layer_stats, + qfns, + params.reference, + kv_tensor.second, + input_scratch.data(), + quantized_scratch.data(), + output_scratch.data(), + global_stats + ); + } + + print_error_stats(type_strs[i], global_stats, params.print_histogram); + } + } + + + llama_free(ctx); + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); + } + + return 0; +} diff --git a/flake.nix b/flake.nix index 4c2717e0d..cd1b6d28e 100644 --- a/flake.nix +++ b/flake.nix @@ -30,6 +30,9 @@ mkdir -p $out/bin mv bin/main $out/bin/llama mv bin/quantize $out/bin/quantize + mv bin/embedding $out/bin/embedding + mv bin/perplexity $out/bin/perplexity + echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml chmod +x $out/bin/convert-pth-to-ggml diff --git a/ggml.c b/ggml.c index b5b6cf8fc..a817f8321 100644 --- a/ggml.c +++ b/ggml.c @@ -92,17 +92,6 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif -#define GGML_MLOCK_SUPPORT 0 - -#ifdef __has_include - #if __has_include() - #undef GGML_MLOCK_SUPPORT - #define GGML_MLOCK_SUPPORT 1 - #include - #endif -#endif - - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -605,10 +594,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); - // absolute max - const float amax = MAX( - MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)), - MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); + const float amax = vmaxvq_f32(amaxv[0]); const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0f/d : 0.0f; @@ -930,7 +916,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int float32x4_t minv[8]; float32x4_t maxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l); for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); @@ -953,7 +939,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int for (int l = 0; l < 8; l++) { const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const int32x4_t vi = vcvtq_s32_f32(v); + const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest + const int32x4_t vi = vcvtq_s32_f32(vf); y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); @@ -2685,21 +2672,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); -// -// ggml object -// - -struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -2711,7 +2683,6 @@ struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; - bool mem_buffer_mlocked; bool no_alloc; int n_objects; @@ -2998,7 +2969,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_size =*/ params.mem_size, /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.mem_buffer_mlocked =*/ false, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, @@ -3031,14 +3001,6 @@ void ggml_free(struct ggml_context * ctx) { GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - if (munlock(ctx->mem_buffer, ctx->mem_size)) { - fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno)); - } - } -#endif - if (ctx->mem_buffer_owned) { free(ctx->mem_buffer); } @@ -3067,48 +3029,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } -#ifdef __APPLE__ -#define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" -#else -#define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" -#endif - -bool ggml_mlock_supported(void) { - return GGML_MLOCK_SUPPORT; -} - -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p) { - // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32 -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - return true; - } - if (mlock(ctx->mem_buffer, ctx->mem_size) || - (opt_extra_len && - mlock(opt_extra_addr, opt_extra_len))) { - if ((*err_p = malloc(1024))) { - snprintf(*err_p, 1024, - "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, - ctx->mem_size + opt_extra_len, - strerror(errno)); - } - return false; - } - ctx->mem_buffer_mlocked = true; - return true; -#else // GGML_MLOCK_SUPPORT - *err_p = strdup("can't mlock because it's not supported on this system"); - return false; -#endif // GGML_MLOCK_SUPPORT -} - //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( @@ -3214,7 +3134,8 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.pad =*/ { 0 }, }; - ggml_assert_aligned(result->data); + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3615,7 +3536,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + + result->nb[0] = src->nb[0]; + result->nb[1] = src->nb[1]; + result->nb[2] = src->nb[2]; + result->nb[3] = src->nb[3]; + + return result; } //////////////////////////////////////////////////////////////////////////////// @@ -4505,6 +4433,37 @@ struct ggml_tensor * ggml_view_2d( return result; } +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + if (a->grad) { + GGML_ASSERT(false); // gradient propagation is not supported + } + + const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? + + return result; +} + // ggml_permute struct ggml_tensor * ggml_permute( @@ -4840,7 +4799,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4857,85 +4815,96 @@ static void ggml_compute_forward_dup_f16( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); } } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; + } + return; + } - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } + GGML_ASSERT(false); // TODO: implement } } @@ -4944,7 +4913,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4961,85 +4929,76 @@ static void ggml_compute_forward_dup_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00*nb00; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, rs); + memcpy(dst_ptr, src0_ptr, sizeof(float)); - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } + GGML_ASSERT(false); // TODO: implement } } @@ -6520,29 +6479,27 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k); -typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k); -typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y); - -typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - vec_dot_q_t vec_dot_q; -} quantize_fns_t; - static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .vec_dot_q = ggml_vec_dot_q4_0, + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .vec_dot_q = ggml_vec_dot_q4_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .vec_dot_q = ggml_vec_dot_q4_1, }, }; +// For internal test use +quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { + GGML_ASSERT(i < GGML_TYPE_COUNT); + return quantize_fns[i]; +} + static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -7194,7 +7151,6 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7221,11 +7177,28 @@ static void ggml_compute_forward_rope_f32( assert(nb0 == sizeof(float)); - // TODO: optimize + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -7251,7 +7224,6 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7278,10 +7250,28 @@ static void ggml_compute_forward_rope_f16( assert(nb0 == sizeof(ggml_fp16_t)); + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -9380,7 +9370,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ROPE: { - node->n_tasks = 1; + node->n_tasks = n_threads; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: diff --git a/ggml.h b/ggml.h index ad962b109..af16c647c 100644 --- a/ggml.h +++ b/ggml.h @@ -253,6 +253,19 @@ enum ggml_op { GGML_OP_COUNT, }; + +// ggml object +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + char padding[8]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); -bool ggml_mlock_supported(void); -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p); - struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, @@ -558,6 +564,16 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, // row stride in bytes size_t offset); +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, @@ -773,6 +789,30 @@ int ggml_cpu_has_blas(void); int ggml_cpu_has_sse3(void); int ggml_cpu_has_vsx(void); + +// +// Internal types and functions exposed for tests and benchmarks +// + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif +typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + +typedef struct { + dequantize_row_q_t dequantize_row_q; + quantize_row_q_t quantize_row_q; + quantize_row_q_t quantize_row_q_reference; + vec_dot_q_t vec_dot_q; +} quantize_fns_t; + +quantize_fns_t ggml_internal_get_quantize_fn(size_t i); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index 78cf9395f..203a1adc0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,49 +1,26 @@ +#include "llama_util.h" #include "llama.h" +#include "llama_internal.h" #include "ggml.h" +#include #include #include #include #include #include #include -#include #include #include - -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) -#define WIN32_LEAN_AND_MEAN -#include -#else -#include -#include -#include -#include -#endif - -#define Min(X, Y) ((Y) > (X) ? (X) : (Y)) -#define Max(X, Y) ((Y) < (X) ? (X) : (Y)) +#include +#include +#include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 -#define LLAMA_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - - -// determine number of model parts based on the dimension -static const std::unordered_map LLAMA_N_PARTS = { - { 4096, 1 }, - { 5120, 2 }, - { 6656, 4 }, - { 8192, 8 }, -}; // available llama models enum e_model { @@ -93,14 +70,18 @@ static const std::map MEM_REQ_EVAL = { // default hparams (LLaMA 7B) struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t f16 = 1; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } }; struct llama_layer { @@ -126,11 +107,17 @@ struct llama_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; - std::vector buf; + llama_buffer buf; int n; // number of tokens currently in the cache + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_model { @@ -146,22 +133,30 @@ struct llama_model { std::vector layers; // context - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; // key + value cache for the self attention // TODO: move to llama_state struct llama_kv_cache kv_self; // the model memory buffer - std::vector buf; + llama_buffer buf; // model memory mapped file - void * mm_addr = NULL; - uint64_t mm_length = 0; + std::unique_ptr mapping; - // tensors - int n_loaded; - std::unordered_map tensors; + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_vocab { @@ -206,8 +201,8 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - std::vector buf_compute; - std::vector buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_buffer buf_compute; + llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -220,11 +215,11 @@ struct llama_context { last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { - buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size); + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); } buf_last = i; @@ -244,6 +239,508 @@ struct llama_context { } }; +template +static T checked_mul(T a, T b) { + T ret = a * b; + if (a != 0 && ret / a != b) { + throw format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b); + } + return ret; +} + +static size_t checked_div(size_t a, size_t b) { + if (b == 0 || a % b != 0) { + throw format("error dividing %zu / %zu", a, b); + } + return a / b; +} + +static std::string llama_format_tensor_shape(const std::vector & ne) { + std::string ret = "[" + std::to_string(ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + ret += " x " + std::to_string(ne.at(i)); + } + ret += "]"; + return ret; +} + +static const char * llama_format_type(enum ggml_type type) { + switch (type) { + case GGML_TYPE_F32: return "f32"; + case GGML_TYPE_F16: return "f16"; + case GGML_TYPE_Q4_0: return "q4_0"; + case GGML_TYPE_Q4_1: return "q4_1"; + default: LLAMA_ASSERT(false); + } +} + +static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { + size_t size = ggml_type_size(type); + for (uint32_t dim : ne) { + size = checked_mul(size, dim); + } + return size / ggml_blck_size(type); +} + +struct llama_load_tensor_shard { + std::vector ne; + size_t size; + enum ggml_type type; + size_t file_idx; + size_t file_off; + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +enum llama_split_type { + SPLIT_NONE, + SPLIT_BY_COLUMNS, + SPLIT_BY_ROWS +}; + +struct llama_load_tensor { + std::vector shards; + + std::string name; + enum ggml_type type = GGML_TYPE_F32; + llama_split_type split_type = SPLIT_NONE; + std::vector ne; + size_t size; + struct ggml_tensor * ggml_tensor = NULL; + uint8_t * data; + + llama_load_tensor(const std::string & name) : name(name) {} + + void calc_all() { + calc_type(); + calc_split_type(); + calc_ne(); + calc_size(); + } + + void calc_type() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.type != first_shard.type) { + throw format("inconsistent tensor shard type in '%s'", name.c_str()); + } + } + type = first_shard.type; + } + + void calc_split_type() { + if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file + shards.size() == 1) { // only one file? + split_type = SPLIT_NONE; + } else if (name.find("tok_embeddings.") == 0 || + name.find(".attention.wo.weight") != std::string::npos || + name.find(".feed_forward.w2.weight") != std::string::npos) { + split_type = SPLIT_BY_COLUMNS; + } else { + split_type = SPLIT_BY_ROWS; + } + } + + void calc_ne() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.ne != first_shard.ne) { + throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + } + } + ne = first_shard.ne; + LLAMA_ASSERT(shards.size() <= UINT32_MAX); + uint32_t n_shards = (uint32_t) shards.size(); + switch (split_type) { + case SPLIT_NONE: + ne = first_shard.ne; + break; + case SPLIT_BY_COLUMNS: + ne = {checked_mul(first_shard.ne[0], n_shards), + first_shard.ne[1]}; + break; + case SPLIT_BY_ROWS: + ne = {first_shard.ne[0], + checked_mul(first_shard.ne[1], n_shards)}; + break; + } + } + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +struct llama_load_tensors_map { + // tensors is kept in a separate vector to preserve file order + std::vector tensors; + std::unordered_map name_to_idx; +}; + +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding +}; + +struct llama_file_loader { + llama_file file; + llama_file_version file_version; + llama_hparams hparams; + llama_vocab vocab; + + llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + : file(fname, "rb") { + fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(file_idx, tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + uint32_t version = 0; + + if (magic != 'ggml') { + version = file.read_u32(); + } + + if (magic == 'ggml' && version == 0) { + file_version = LLAMA_FILE_VERSION_GGML; + } else if (magic == 'ggmf' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGMF_V1; + } else if (magic == 'ggjt' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else { + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); + } + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.f16 = file.read_u32(); + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + file.read_raw(&score, sizeof(score)); + } + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_load_tensor_shard shard; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + uint32_t ftype = file.read_u32(); + shard.ne.resize(n_dims); + file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + std::string name = file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + } + switch (ftype) { + case 0: shard.type = GGML_TYPE_F32; break; + case 1: shard.type = GGML_TYPE_F16; break; + case 2: shard.type = GGML_TYPE_Q4_0; break; + case 3: shard.type = GGML_TYPE_Q4_1; break; + default: { + throw format("unrecognized ftype %u\n", ftype); + } + } + + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + // skip to the next multiple of 32 bytes + file.seek(-file.tell() & 31, SEEK_CUR); + } + shard.file_idx = file_idx; + shard.file_off = file.tell(); + + shard.calc_size(); + file.seek(shard.size, SEEK_CUR); + + auto it = tensors_map.name_to_idx.find(name); + size_t idx; + if (it != tensors_map.name_to_idx.end()) { + idx = it->second; + } else { + tensors_map.tensors.emplace_back(name); + idx = tensors_map.tensors.size() - 1; + tensors_map.name_to_idx.emplace(name, idx); + } + tensors_map.tensors.at(idx).shards.push_back(shard); + } + } +}; + +struct llama_file_saver { + llama_file file; + llama_file_loader * any_file_loader; + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + : file(fname, "wb"), any_file_loader(any_file_loader) { + fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_f16); + write_vocab(); + } + void write_magic() { + file.write_u32('ggjt'); // magic + file.write_u32(1); // version + } + void write_hparams(uint32_t new_f16) { + const llama_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_mult); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.n_rot); + file.write_u32(new_f16); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + uint32_t ftype; + switch (new_type) { + case GGML_TYPE_F32: ftype = 0; break; + case GGML_TYPE_F16: ftype = 1; break; + case GGML_TYPE_Q4_0: ftype = 2; break; + case GGML_TYPE_Q4_1: ftype = 3; break; + default: LLAMA_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(ftype); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-file.tell() & 31, SEEK_CUR); + LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_model_loader { + std::vector> file_loaders; + llama_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr mapping; + + llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + file_loaders.emplace_back(first_file); + uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); + for (uint32_t i = 1; i < n_parts; i++) { + std::string fname = fname_base + "." + std::to_string(i); + auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + file_loaders.emplace_back(ith_file); + if (ith_file->hparams != first_file->hparams) { + throw format("llama.cpp: hparams inconsistent between files"); + } + } + if (!llama_mmap::SUPPORTED) { + use_mmap = false; + } + if (use_mmap && alignment_prevents_mmap()) { + fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); + use_mmap = false; + } + this->use_mmap = use_mmap; + for (llama_load_tensor & lt : tensors_map.tensors) { + lt.calc_all(); + } + } + + bool alignment_prevents_mmap() { + for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const llama_load_tensor_shard & shard : lt.shards) { + if (shard.file_off & 3) { + return true; + } + } + } + return false; + } + + uint32_t guess_n_parts() const { + auto it = tensors_map.name_to_idx.find("tok_embeddings.weight"); + if (it == tensors_map.name_to_idx.end()) { + throw std::string("missing tok_embeddings.weight"); + } + const llama_load_tensor & lt = tensors_map.tensors.at(it->second); + return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); + } + + void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { + *ctx_size_p = *mmapped_size_p = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + } + } + + struct ggml_tensor * get_tensor(const std::string & name, std::vector ne) { + auto it = tensors_map.name_to_idx.find(name); + if (it == tensors_map.name_to_idx.end()) { + throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); + } + llama_load_tensor & lt = tensors_map.tensors.at(it->second); + if (lt.ne != ne) { + throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", + name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); + } + return get_tensor_for(lt); + } + + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * tensor; + if (lt.ne.size() == 2) { + tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + } else { + LLAMA_ASSERT(lt.ne.size() == 1); + tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + } + LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + lt.ggml_tensor = tensor; + num_ggml_tensors_created++; + return tensor; + } + + void done_getting_tensors() { + if (num_ggml_tensors_created != tensors_map.tensors.size()) { + throw std::string("llama.cpp: file contained more tensors than expected"); + } + } + + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t data_size = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + data_size += lt.size; + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + if (!lmlock) { + // Don't call the callback since the actual loading will be lazy + // and we can't measure it. + progress_callback = NULL; + } + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (llama_load_tensor & lt : tensors_map.tensors) { + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_tensor->data; + load_data_for(lt); + lt.ggml_tensor->data = lt.data; + done_size += lt.size; + if (use_mmap && lmlock) { + lmlock->grow_to(done_size); + } + } + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + } + + void load_data_for(llama_load_tensor & lt) { + if (use_mmap) { + LLAMA_ASSERT(lt.shards.size() == 1); + lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; + } else if (lt.split_type == SPLIT_NONE) { + llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + file.seek(lt.shards.at(0).file_off, SEEK_SET); + file.read_raw(lt.data, lt.size); + } else if (lt.split_type == SPLIT_BY_ROWS) { + size_t offset = 0; + for (llama_load_tensor_shard & shard : lt.shards) { + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + file.read_raw(lt.data + offset, shard.size); + offset += shard.size; + } + LLAMA_ASSERT(offset == lt.size); + } else if (lt.split_type == SPLIT_BY_COLUMNS) { + // Let's load the data into temporary buffers to ensure the OS performs large loads. + std::vector tmp_bufs; + tmp_bufs.resize(lt.shards.size()); + for (size_t i = 0; i < lt.shards.size(); i++) { + llama_load_tensor_shard & shard = lt.shards.at(i); + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + tmp_bufs.at(i).resize(shard.size); + file.read_raw(tmp_bufs.at(i).addr, shard.size); + } + // Then reshape. + size_t num_rows = lt.ne.at(1); + size_t per_shard_row_size = lt.shards.at(0).size / num_rows; + size_t out_offset = 0; + for (size_t row = 0; row < num_rows; row++) { + for (llama_buffer & tmp_buf : tmp_bufs) { + memcpy(lt.data + out_offset, + tmp_buf.addr + row * per_shard_row_size, + per_shard_row_size); + out_offset += per_shard_row_size; + } + } + LLAMA_ASSERT(out_offset == lt.size); + } + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(llama_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + + // // kv cache // @@ -262,8 +759,8 @@ static bool kv_cache_init( cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; - params.mem_size = cache.buf.size(); - params.mem_buffer = cache.buf.data(); + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); @@ -279,13 +776,6 @@ static bool kv_cache_init( return true; } -static void kv_cache_free(struct llama_kv_cache & cache) { - if (cache.ctx) { - ggml_free(cache.ctx); - cache.ctx = nullptr; - } -} - struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -294,6 +784,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, /*.progress_callback =*/ nullptr, @@ -303,243 +794,94 @@ struct llama_context_params llama_context_default_params() { return result; } +bool llama_mmap_supported() { + return llama_mmap::SUPPORTED; +} + +bool llama_mlock_supported() { + return llama_mlock::SUPPORTED; +} + // // model loading // -static void *mmap_file(const char *fname, uint64_t *mm_length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - HANDLE hFile = CreateFileA(fname, - GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, - NULL); - if (hFile == INVALID_HANDLE_VALUE) return 0; - LARGE_INTEGER fileSize; - fileSize.QuadPart = -1; - GetFileSizeEx(hFile, &fileSize); - int64_t length = fileSize.QuadPart; - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - CloseHandle(hFile); - if (!hMapping) return 0; - void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - CloseHandle(hMapping); - if (!addr) return 0; -#else - int fd = open(fname, O_RDONLY); - if (fd == -1) return 0; - int64_t length = lseek(fd, 0, SEEK_END); - void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) return 0; -#endif - *mm_length = length; - return addr; +static const char *llama_file_version_name(llama_file_version version) { + switch (version) { + case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)"; + default: LLAMA_ASSERT(false); + } } -static void munmap_file(void * addr, size_t length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - UnmapViewOfFile(addr); -#else - munmap(addr, length); -#endif +static const char *llama_model_type_name(e_model type) { + switch (type) { + case MODEL_7B: return "7B"; + case MODEL_13B: return "13B"; + case MODEL_30B: return "30B"; + case MODEL_65B: return "65B"; + default: LLAMA_ASSERT(false); + } } -static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) { - fprintf(stderr, - "%s: invalid model file (bad magic [got %#x want %#x])\n" - "\tyou most likely need to regenerate your ggml files\n" - "\tthe benefit is you'll get 10-100x faster load times\n" - "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n" - "\tuse convert-pth-to-ggml.py to regenerate from original pth\n" - "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n", - path, got, want); - return false; -} - -static bool llama_model_load( +static void llama_model_load_internal( const std::string & fname, llama_context & lctx, int n_ctx, - int n_parts, ggml_type memory_type, + bool use_mmap, + bool use_mlock, bool vocab_only, llama_progress_callback progress_callback, - void *progress_callback_user_data) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + void * progress_callback_user_data) { lctx.t_start_us = ggml_time_us(); + std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + + lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); auto & model = lctx.model; - auto & vocab = lctx.vocab; + model.hparams = ml->file_loaders.at(0)->hparams; + llama_file_version file_version = ml->file_loaders.at(0)->file_version; + auto & hparams = model.hparams; + uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - std::vector f_buf(1024*1024); - fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); - - fin.seekg(0, fin.end); - const size_t file_size = fin.tellg(); - fin.seekg(0); - - // verify magic { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n", - __func__, fname.c_str()); - return false; + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = e_model::MODEL_65B; break; } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC); - } - - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - } - - int n_ff = 0; - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fin.read((char *) &hparams.f16, sizeof(hparams.f16)); hparams.n_ctx = n_ctx; - - n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - - if (n_parts < 1) { - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - } - - // temp warning to tell the user to use "--n_parts" - if (hparams.f16 == 4 && n_parts != 1) { - fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts); - fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); - } - - if (hparams.n_layer == 32) { - model.type = e_model::MODEL_7B; - } - - if (hparams.n_layer == 40) { - model.type = e_model::MODEL_13B; - } - - if (hparams.n_layer == 60) { - model.type = e_model::MODEL_30B; - } - - if (hparams.n_layer == 80) { - model.type = e_model::MODEL_65B; - } - - fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); - fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); - fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - fprintf(stderr, "%s: type = %d\n", __func__, model.type); } - // load vocab { - std::string word; - vocab.id_to_token.resize(model.hparams.n_vocab); - std::vector tmp(64); - - for (int i = 0; i < model.hparams.n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - if (len > 0) { - tmp.resize(len); - fin.read(tmp.data(), len); - word.assign(tmp.data(), len); - } else { - word.clear(); - } - - float score; - fin.read((char *) &score, sizeof(score)); - - vocab.token_to_id[word] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word; - tok_score.score = score; - } + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (vocab_only) { - return true; + return; } - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - // wtype is for per-layer weights, while vtype is for other weights - ggml_type wtype, vtype; - switch (model.hparams.f16) { - case 0: wtype = vtype = GGML_TYPE_F32; break; - case 1: wtype = vtype = GGML_TYPE_F16; break; - case 2: wtype = vtype = GGML_TYPE_Q4_0; break; - case 3: wtype = vtype = GGML_TYPE_Q4_1; break; - case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break; - default: - { - fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", - __func__, fname.c_str(), model.hparams.f16); - return false; - } - } - - // map model into memory - char *mm_addr = NULL; - model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); - if (model.mm_addr == NULL) { - fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); - return false; - } - mm_addr = (char *)model.mm_addr; - fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); - auto & ctx = model.ctx; - size_t ctx_size = 0; - { - const auto &hparams = model.hparams; - const int n_layer = hparams.n_layer; - ctx_size += (5 + 10*n_layer)*256; // object overhead - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - } + size_t ctx_size, mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); // print memory requirements { @@ -548,7 +890,7 @@ static bool llama_model_load( // this is the total memory required to run the inference const size_t mem_required = ctx_size + - model.mm_length + + mmapped_size + MEM_REQ_SCRATCH0.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_EVAL.at (model.type); @@ -564,17 +906,20 @@ static bool llama_model_load( // create the ggml context { lctx.model.buf.resize(ctx_size); + if (use_mlock) { + lctx.model.mlock_buf.init(lctx.model.buf.addr); + lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size(), - /*.mem_buffer =*/ lctx.model.buf.data(), - /*.no_alloc =*/ true, + /*.mem_size =*/ lctx.model.buf.size, + /*.mem_buffer =*/ lctx.model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, }; model.ctx = ggml_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; + throw format("ggml_init() failed"); } } @@ -582,161 +927,71 @@ static bool llama_model_load( { const auto & hparams = model.hparams; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + ml->ggml_ctx = ctx; + + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); model.layers.resize(n_layer); - - model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - // map by name - model.tensors["tok_embeddings.weight"] = model.tok_embeddings; - - model.tensors["norm.weight"] = model.norm; - model.tensors["output.weight"] = model.output; - - for (int i = 0; i < n_layer; ++i) { + for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + std::string layers_i = "layers." + std::to_string(i); - layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); - layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; - - model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); } } - std::vector tmp; + ml->done_getting_tensors(); - if (progress_callback) { - progress_callback(0.0, progress_callback_user_data); + // populate `tensors_by_name` + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } - fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str()); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return false; - } - - auto tensor = model.tensors[name.data()]; - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return false; - } - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - return false; - } - if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - switch (ftype) { - case 0: // f32 - case 1: // f16 - break; - case 2: // q4_0 - case 3: // q4_1 - assert(ne[0] % 64 == 0); - break; - default: - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - return false; - }; - - // load the tensor data into memory without copying or reading it - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(tensor); - offset = (offset + 31) & -32; - tensor->data = mm_addr + offset; - fin.seekg(offset + tensor_data_size); - total_size += tensor_data_size; - model.n_loaded++; - - // progress - if (progress_callback) { - double current_progress = size_t(fin.tellg()) / double(file_size); - progress_callback(current_progress, progress_callback_user_data); - } - } - - fin.close(); - - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); - if (model.n_loaded == 0) { - fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); - } else if (model.n_loaded != (int) model.tensors.size()) { - fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); - return false; - } - } + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration lctx.t_load_us = ggml_time_us() - lctx.t_start_us; +} - if (progress_callback) { - progress_callback(1.0, progress_callback_user_data); +static bool llama_model_load( + const std::string & fname, + llama_context & lctx, + int n_ctx, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::string & err) { + fprintf(stderr, "error loading model: %s\n", err.c_str()); + return false; } - - return true; } // evaluate the transformer @@ -774,8 +1029,8 @@ static bool llama_eval_internal( auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size(), - /*.mem_buffer =*/ buf_compute.data(), + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; @@ -810,37 +1065,35 @@ static bool llama_eval_internal( // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + { + // compute the transposed [N, n_embd] V matrix + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, - ggml_rope(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - n_past, n_rot, 0), + Qcur, 0, 2, 1, 3); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ggml_tensor * K = ggml_permute(ctx0, - ggml_rope(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), - n_embd/n_head, n_head, n_past + N), - n_past, n_rot, 1), + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q @@ -858,18 +1111,23 @@ static bool llama_eval_internal( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(kv_self.v), + n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, + il*n_ctx*ggml_element_size(kv_self.v)*n_embd); - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? + struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); @@ -955,9 +1213,13 @@ static bool llama_eval_internal( ggml_build_forward_expand(&gf, inpL); ggml_graph_compute (ctx0, &gf); + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + //ggml_graph_print(&gf); + + // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); //} //embd_w.resize(n_vocab*N); @@ -1054,7 +1316,7 @@ struct llama_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_sp_symbol sym; - size_t char_len = Min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -1229,17 +1491,13 @@ static llama_vocab::id llama_sample_top_p_top_k( } } - sample_top_k(logits_id, top_k); - - float maxl = -std::numeric_limits::infinity(); - for (const auto & kv : logits_id) { - maxl = Max(maxl, kv.first); - } + sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits); // compute probs for the top k tokens std::vector probs; probs.reserve(logits_id.size()); + float maxl = logits_id[0].first; double sum = 0.0; for (const auto & kv : logits_id) { const float p = expf(kv.first - maxl); @@ -1262,16 +1520,11 @@ static llama_vocab::id llama_sample_top_p_top_k( break; } } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } } //printf("\n"); //for (int i = 0; i < (int) 10; i++) { - // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); + // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]); //} //printf("\n\n"); //exit(0); @@ -1286,298 +1539,118 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -// TODO: reuse code from the llama_model_load() somehow -static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { - ggml_type type = GGML_TYPE_Q4_1; - +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { + ggml_type quantized_type; switch (itype) { - case 2: type = GGML_TYPE_Q4_0; break; - case 3: type = GGML_TYPE_Q4_1; break; - default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; + case 2: quantized_type = GGML_TYPE_Q4_0; break; + case 3: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid quantization type %d\n", itype); }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { - fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); - return false; - } + std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + /*vocab_only*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); - llama_vocab vocab; + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector hist_all(1 << 4, 0); - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + size_t idx = 0; + for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } + printf("[%zu/%zu] %36s - %s, type = %6s, ", + ++idx, model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + llama_format_type(tensor.type)); - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", - __func__, fname_inp.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC); - } + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); - fout.write((char *) &magic, sizeof(magic)); + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_buffer work; - uint32_t format_version; - finp.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - - fout.write((char *) &format_version, sizeof(format_version)); - } - - llama_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &itype, sizeof(hparams.f16)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::vector word(32); - vocab.id_to_token.resize(n_vocab); - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) &word[0], len); - fout.write((char *) &word[0], len); - - float score; - finp.read ((char *) &score, sizeof(score)); - fout.write((char *) &score, sizeof(score)); - - vocab.token_to_id[word.data()] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word.data(); - tok_score.score = score; - } - } - - // load weights - { - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast(&length), sizeof(length)); - finp.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - { - // ensure tensor data is aligned - uint64_t offset = finp.tellg(); - offset = (offset + 31) & -32; - finp.seekg(offset); - } - - { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - // regexes of tensor names to be quantized - const std::vector k_names = { - ".*weight", - }; - - bool quantize = false; - for (const auto & s : k_names) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_buffer f32_conv_buf; + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (tensor.type == GGML_TYPE_F16) { + f32_conv_buf.resize(nelements * sizeof(float)); + f32_data = (float *) f32_conv_buf.addr; + auto f16_data = (const ggml_fp16_t *) tensor.data; + for (size_t i = 0; i < nelements; i++) { + f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ftype != 0 && ftype != 1) { - fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); - return false; - } - - if (ftype == 1) { - data_f16.resize(nelements); - finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ftype = itype; } else { - const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type)); } - fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast(&length), sizeof(length)); - fout.write(reinterpret_cast(&ftype), sizeof(ftype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - fout.write(&name[0], length); + printf("quantizing .. "); + fflush(stdout); - { - // ensure tensor data is aligned - uint64_t offset = fout.tellp(); - offset = (offset + 31) & -32; - fout.seekp(offset); + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: + { + new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + default: + LLAMA_ASSERT(false); } - if (quantize) { - printf("quantizing .. "); - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch (type) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - default: - { - fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); - return false; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / float(nelements)); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); + printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; } - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / float(sum_all)); + for (size_t i = 0; i < hist_cur.size(); i++) { + printf("%5.3f ", hist_cur[i] / float(nelements)); } printf("\n"); } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); } - finp.close(); - fout.close(); + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - return true; + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + printf("%5.3f ", hist_all[i] / float(sum_all)); + } + printf("\n"); + } } // @@ -1595,32 +1668,36 @@ struct llama_context * llama_init_from_file( params.seed = time(NULL); } + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + ++*cur_percentage_p; + fprintf(stderr, "."); + fflush(stderr); + if (percentage >= 100) { + fprintf(stderr, "\n"); + } + } + }; + } + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, - params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); return nullptr; } - if (params.use_mlock) { - char *err; - if (!ggml_mlock(ctx->model.ctx, - ctx->model.mm_addr, - ctx->model.mm_length, - &err)) { - fprintf(stderr, "%s\n", err); - free(err); - llama_free(ctx); - return nullptr; - } - } - // reserve memory for context buffers if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { @@ -1657,16 +1734,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - kv_cache_free(ctx->model.kv_self); - - if (ctx->model.ctx) { - ggml_free(ctx->model.ctx); - } - - if (ctx->model.mm_addr) { - munmap_file(ctx->model.mm_addr, ctx->model.mm_length); - } - delete ctx; } @@ -1674,23 +1741,24 @@ int llama_model_quantize( const char * fname_inp, const char * fname_out, int itype) { - if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) { - fprintf(stderr, "%s: failed to quantize\n", __func__); + try { + llama_model_quantize_internal(fname_inp, fname_out, itype); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); return 1; } - - return 0; } // Returns the KV cache that will contain the context for the // ongoing prediction with the model. const uint8_t * llama_get_kv_cache(struct llama_context * ctx) { - return ctx->model.kv_self.buf.data(); + return ctx->model.kv_self.buf.addr; } // Returns the size of the KV cache size_t llama_get_kv_cache_size(struct llama_context * ctx) { - return ctx->model.kv_self.buf.size(); + return ctx->model.kv_self.buf.size; } int llama_get_kv_cache_token_count(struct llama_context * ctx) { @@ -1704,8 +1772,8 @@ void llama_set_kv_cache( size_t n_size, int n_token_count) { // Make sure we have the same kv cache setup - LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size); - memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size); + LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size); + memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size); ctx->model.kv_self.n = n_token_count; } @@ -1816,9 +1884,9 @@ llama_token llama_sample_top_p_top_k( void llama_print_timings(struct llama_context * ctx) { const int64_t t_end_us = ggml_time_us(); - const int32_t n_sample = Max(1, ctx->n_sample); - const int32_t n_eval = Max(1, ctx->n_eval); - const int32_t n_p_eval = Max(1, ctx->n_p_eval); + const int32_t n_sample = std::max(1, ctx->n_sample); + const int32_t n_eval = std::max(1, ctx->n_eval); + const int32_t n_p_eval = std::max(1, ctx->n_p_eval); fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); @@ -1854,3 +1922,8 @@ const char * llama_print_system_info(void) { return s.c_str(); } + +// For internal test use +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { + return ctx->model.tensors_by_name; +} diff --git a/llama.h b/llama.h index 04e2bf71c..42c364c6b 100644 --- a/llama.h +++ b/llama.h @@ -55,6 +55,7 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only @@ -66,6 +67,9 @@ extern "C" { LLAMA_API struct llama_context_params llama_context_default_params(); + LLAMA_API bool llama_mmap_supported(); + LLAMA_API bool llama_mlock_supported(); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure @@ -166,4 +170,4 @@ extern "C" { } #endif -#endif +#endif // LLAMA_H diff --git a/llama_internal.h b/llama_internal.h new file mode 100644 index 000000000..543eed996 --- /dev/null +++ b/llama_internal.h @@ -0,0 +1,12 @@ +// Internal header to be included by llama.cpp and tests/benchmarks only. + +#ifndef LLAMA_INTERNAL_H +#define LLAMA_INTERNAL_H + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); + +#endif // LLAMA_INTERNAL_H diff --git a/llama_util.h b/llama_util.h new file mode 100755 index 000000000..d68f49bd2 --- /dev/null +++ b/llama_util.h @@ -0,0 +1,383 @@ +// Internal header to be included only by llama.cpp. +// Contains wrappers around OS interfaces. + +#ifndef LLAMA_UTIL_H +#define LLAMA_UTIL_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #include + #include // for _fseeki64 +#endif + +#define LLAMA_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static std::string format(const char * fmt, ...) { + va_list ap, ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + LLAMA_ASSERT(size >= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + LLAMA_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +}; + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw format("failed to open %s: %s", fname, std::strerror(errno)); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + LLAMA_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + LLAMA_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw format("read error: %s", strerror(errno)); + } + if (ret != 1) { + throw std::string("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw format("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +#if defined(_WIN32) +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; +#ifdef __linux__ + flags |= MAP_POPULATE; +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + throw format("mmap failed: %s", strerror(errno)); + } + + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + + ~llama_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file) { + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + CloseHandle(hFile); + + if (hMapping == NULL) { + throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); + } + + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_mmap(struct llama_file *) { + throw std::string("mmap not supported"); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; + + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * addr) { + LLAMA_ASSERT(this->addr == NULL && this->size == 0); + this->addr = addr; + } + + void grow_to(size_t target_size) { + LLAMA_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION, + size, this->size, std::strerror(errno)); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * addr, size_t size) { + for (int tries = 1; ; tries++) { + if (VirtualLock(addr, size)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + size, this->size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = size + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += size; + max_ws_size += size; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + void raw_unlock(void * addr, size_t size) { + if (!VirtualUnlock(addr, size)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + void raw_lock(const void * addr, size_t size) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + } + + void raw_unlock(const void * addr, size_t size) {} +#endif +}; + +// Replacement for std::vector that doesn't require zero-initialization. +struct llama_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] addr; + addr = new uint8_t[size]; + this->size = size; + } + + ~llama_buffer() { + delete[] addr; + } +}; +#endif diff --git a/media/llama-leader.jpeg b/media/llama-leader.jpeg new file mode 100644 index 000000000..0b4e6e1cf Binary files /dev/null and b/media/llama-leader.jpeg differ diff --git a/media/llama0-banner.png b/media/llama0-banner.png new file mode 100644 index 000000000..cee3a87f1 Binary files /dev/null and b/media/llama0-banner.png differ diff --git a/media/llama0-logo.png b/media/llama0-logo.png new file mode 100644 index 000000000..e55b38bd9 Binary files /dev/null and b/media/llama0-logo.png differ diff --git a/media/llama1-banner.png b/media/llama1-banner.png new file mode 100644 index 000000000..1e469584e Binary files /dev/null and b/media/llama1-banner.png differ diff --git a/media/llama1-logo.png b/media/llama1-logo.png new file mode 100644 index 000000000..365c5b865 Binary files /dev/null and b/media/llama1-logo.png differ