diff --git a/.gitignore b/.gitignore index a0525430d..d8dd34fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ models/* /main /quantize +/quantize-stats /result /perplexity /embedding diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a434f07b..6bec1f97b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,7 @@ if (LLAMA_OPENBLAS) add_compile_definitions(GGML_USE_OPENBLAS) add_link_options(${BLAS_LIBRARIES}) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas) else() message(WARNING "OpenBLAS not found") endif() @@ -139,6 +140,7 @@ if (LLAMA_ALL_WARNINGS) -Wpedantic -Wcast-qual -Wno-unused-function + -Wno-multichar ) else() # todo : msvc @@ -151,6 +153,10 @@ if (LLAMA_ALL_WARNINGS) endif() +if (MSVC) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() + if (LLAMA_LTO) include(CheckIPOSupported) check_ipo_supported(RESULT result OUTPUT output) @@ -240,7 +246,9 @@ endif() add_library(llama llama.cpp - llama.h) + llama.h + llama_internal.h + llama_util.h) target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump diff --git a/Makefile b/Makefile index cb14ffdbc..3e58a28a7 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ LDFLAGS = # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function -CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific # TODO: support Windows @@ -142,14 +142,14 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h +llama.o: llama.cpp llama.h llama_util.h llama_internal.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize perplexity embedding + rm -vf *.o main quantize quantize-stats perplexity embedding main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -160,12 +160,17 @@ main: examples/main/main.cpp ggml.o llama.o common.o quantize: examples/quantize/quantize.cpp ggml.o llama.o $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS) + perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) +libllama.so: llama.o ggml.o + $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) # # Tests # diff --git a/README.md b/README.md index d9269c0f1..ef82855e4 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ New features will probably be added mostly through community contributions. 
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)

 **Bindings:**

@@ -350,20 +351,22 @@ We have two Docker images available for this project:

 The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.

+Replace `/path/to/models` below with the actual path where you downloaded the models.
+
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```

 On complete, you are ready to play!

 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with light image:

 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Contributing

diff --git a/build.zig b/build.zig
index b73460880..defc2c3ad 100644
--- a/build.zig
+++ b/build.zig
@@ -3,12 +3,14 @@ const std = @import("std");
 pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+    const want_lto = b.option(bool, "lto", "Want -fLTO");

     const lib = b.addStaticLibrary(.{
         .name = "llama",
         .target = target,
         .optimize = optimize,
     });
+    lib.want_lto = want_lto;
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("examples");
@@ -17,11 +19,11 @@ pub fn build(b: *std.Build) void {
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
-        "examples/common.cpp",
     }, &.{"-std=c++11"});
     lib.install();

-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+
     const exe = build_example("main", build_args);
     _ = build_example("quantize", build_args);
     _ = build_example("perplexity", build_args);
@@ -44,16 +46,19 @@ fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjS
     const lib = args.lib;
     const target = args.target;
     const optimize = args.optimize;
+    const want_lto = args.want_lto;

     const exe = b.addExecutable(.{
         .name = name,
         .target = target,
         .optimize = optimize,
     });
+    exe.want_lto = want_lto;
     exe.addIncludePath(".");
     exe.addIncludePath("examples");
     exe.addCSourceFiles(&.{
         std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
+        "examples/common.cpp",
     }, &.{"-std=c++11"});
     exe.linkLibrary(lib);
     exe.install();
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ce3a34710..67a7cea54 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
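
// A minimal sketch of a standalone program that could link against the new
// `libllama.so` Makefile target introduced above. It only uses functions
// declared in llama.h at this revision; the model path and a build line such
// as `g++ -std=c++11 app.cpp -L. -lllama -o app` are placeholders, not part
// of this change.
#include "llama.h"
#include <cstdio>

int main() {
    llama_context_params lparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    fprintf(stderr, "loaded model with a vocab of %d tokens\n", llama_n_vocab(ctx));
    llama_free(ctx);
    return 0;
}
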
add_subdirectory(embedding) endif() diff --git a/examples/common.cpp b/examples/common.cpp index 6b2d62168..b7d381735 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,7 +1,5 @@ #include "common.h" -#include "ggml.h" - #include #include #include @@ -16,12 +14,19 @@ #endif #if defined (_WIN32) +#include +#include #pragma comment(lib,"kernel32.lib") extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); +extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, + const wchar_t * lpWideCharStr, int cchWideChar, + char * lpMultiByteStr, int cbMultiByte, + const char * lpDefaultChar, bool * lpUsedDefaultChar); +#define CP_UTF8 65001 #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { @@ -162,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_color = true; } else if (arg == "--mlock") { params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; } else if (arg == "--mtest") { params.mem_test = true; } else if (arg == "--verbose-prompt") { @@ -247,9 +254,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - if (ggml_mlock_supported()) { + if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); @@ -321,12 +331,20 @@ void win32_console_init(bool enable_color) { SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) } // Set console output codepage to UTF8 - SetConsoleOutputCP(65001); // CP_UTF8 + SetConsoleOutputCP(CP_UTF8); } void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10) if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) { - // Set console input codepage to UTF8 - SetConsoleCP(65001); // CP_UTF8 + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); } } + +// Convert a wide Unicode string to an UTF8 string +void win32_utf8_encode(const std::wstring & wstr, std::string & str) { + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); + std::string strTo(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); + str = strTo; +} #endif diff --git a/examples/common.h b/examples/common.h index 8022c6268..793d6c2a4 100644 --- a/examples/common.h +++ b/examples/common.h @@ -48,6 +48,7 @@ struct 
gpt_params { bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt + bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool verbose_prompt = false; // print prompt tokens before generation @@ -93,4 +94,5 @@ void set_console_color(console_state & con_st, console_color_t color); #if defined (_WIN32) void win32_console_init(bool enable_color); +void win32_utf8_encode(const std::wstring & wstr, std::string & str); #endif diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 950b4478c..03cd64439 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -38,6 +38,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b8cd33b75..13a35a787 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -97,6 +97,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -386,10 +387,19 @@ int main(int argc, char ** argv) { std::string line; bool another_line = true; do { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // input stream is bad or EOF received + return 0; + } + win32_utf8_encode(wline, line); +#else if (!std::getline(std::cin, line)) { // input stream is bad or EOF received return 0; } +#endif if (line.empty() || line.back() != '\\') { another_line = false; } else { @@ -431,7 +441,7 @@ int main(int argc, char ** argv) { } // end of text token - if (embd.back() == llama_token_eos()) { + if (!embd.empty() && embd.back() == llama_token_eos()) { if (params.instruct) { is_interacting = true; } else { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 88a14fbcd..69a5aa7f3 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -115,6 +115,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt new file mode 100644 index 000000000..7bebc11a1 --- /dev/null +++ b/examples/quantize-stats/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize-stats) +add_executable(${TARGET} quantize-stats.cpp) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp new file mode 100644 index 000000000..203bfe8cc --- /dev/null +++ b/examples/quantize-stats/quantize-stats.cpp @@ -0,0 +1,354 @@ +#include "ggml.h" +#include "llama.h" +#include "llama_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include + +static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; +static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); + +struct quantize_stats_params { + std::string model = "models/7B/ggml-model-f16.bin"; + bool verbose = false; + bool per_layer_stats = false; + bool print_histogram = false; + bool reference = false; + std::vector include_layers; + std::vector exclude_layers; + std::vector include_types; +}; + +const int64_t SCRATCH_ELEMENTS = 32*32; +const size_t HISTOGRAM_BUCKETS = 150; +const double HISTOGRAM_RANGE = 0.03; + +struct error_stats { + size_t num_samples; + double total_error; + double max_error; + uint64_t error_histogram[HISTOGRAM_BUCKETS]; +}; + + +void quantize_stats_print_usage(int /*argc*/, char ** argv) { + quantize_stats_params params; + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -r, --reference\n"); + fprintf(stderr, " use reference implementation (default: false)\n"); + fprintf(stderr, " -v, --verbose\n"); + fprintf(stderr, " verbose output (default: false)\n"); + fprintf(stderr, " -p, --per-layer-stats\n"); + fprintf(stderr, " print stats per layer (default: false)\n"); + fprintf(stderr, " --histogram\n"); + fprintf(stderr, " print error histogram (default: false)\n"); + fprintf(stderr, " -l LAYER, --include-layer LAYER\n"); + fprintf(stderr, " only test layers matching pattern\n"); + fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n"); + fprintf(stderr, " exclude layers matching pattern\n"); + fprintf(stderr, " -t TYPE, --type TYPE\n"); + fprintf(stderr, " only test given type (q4_0, q4_1)\n"); + fprintf(stderr, "\n"); +} + +// Check if a layer is included/excluded by command line +bool layer_included(const quantize_stats_params params, const std::string & layer) { + for (const auto& excluded : params.exclude_layers) { + if (std::regex_search(layer, std::regex(excluded))) { + return false; + } + } + for (const auto& included : params.include_layers) { + if (std::regex_search(layer, std::regex(included))) { + return true; + } + } + return params.include_layers.empty(); +} + +// Update error statistics given vectors with the before/after result of quantization +void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { + for (int64_t i = 0; i < nelements; i++) { + double diff = input[i] - output[i]; + stats.total_error += diff * diff; + stats.max_error = fmax(fabs(diff), stats.max_error); + stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++; + } + stats.num_samples += nelements; +} + +double find_quantile(const error_stats & stats, double quantile) { + double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0); + + double accum = 0; + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + accum += stats.error_histogram[i]; + if (accum >= sum*quantile) { + return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + } + } + return INFINITY; +} + +void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { + double rmse = sqrt(stats.total_error / (double) stats.num_samples); + double 
median = find_quantile(stats, .5); + double pct95 = find_quantile(stats, .95); + printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); + if (print_histogram) { + printf("Error distribution:\n"); + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; + printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]); + } + } +} + +// copied from ggml.h - verify that we can access this as a flat array +static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +// Run quantization function for a single layer and update error stats +void test_roundtrip_on_layer( + std::string & name, + bool print_layer_stats, + const quantize_fns_t & qfns, + bool use_reference, + const ggml_tensor * layer, + float * input_scratch, + char *quantized_scratch, + float * output_scratch, + error_stats & total_error) { + + assert(tensor_is_contiguous(layer)); + error_stats layer_error {}; + int64_t nelements = ggml_nelements(layer); + + for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) { + int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset); + + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); + } else { + qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + } + qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, total_error); + if (print_layer_stats) { + update_error_stats(chunk_size, input_scratch, output_scratch, layer_error); + } + } + if (print_layer_stats) { + print_error_stats(name, layer_error, false); + } +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + quantize_stats_params params; + + // read command line + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-h" || arg == "--help") { + quantize_stats_print_usage(argc, argv); + exit(0); + } else if (arg == "-r" || arg == "--reference") { + params.reference = true; + } else if (arg == "-v") { + params.verbose = true; + } else if (arg == "-p" || arg == "--per-layer-stats") { + params.per_layer_stats = true; + } else if (arg == "--histogram") { + params.print_histogram = true; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-l" || arg == "--include-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_layers.push_back(argv[i]); + } else if (arg == "-L" || arg == "--exclude-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.exclude_layers.push_back(argv[i]); + } else if (arg == "-t" || arg == "--type") { + if (++i >= argc) { + invalid_param = 
true; + break; + } + int j; + for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) { + // find match + } + if (j < GGML_TYPE_COUNT) { + params.include_types.push_back((ggml_type) j); + } else { + fprintf(stderr, "error: %s not in list of types\n", argv[i]); + invalid_param = true; + } + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + + // load the model + fprintf(stderr, "Loading model\n"); + + const int64_t t_main_start_us = ggml_time_us(); + llama_context * ctx; + + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = 256; + lparams.n_parts = 1; + lparams.seed = 1; + lparams.f16_kv = false; + lparams.use_mlock = false; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + const auto &tensors = llama_internal_get_tensor_map(ctx); + + // check layer tensors + int included_layers = 0; + int64_t max_nelements = 0; + bool is_f16 = false; + for (const auto& kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second)); + } + if (kv_tensor.second->type == GGML_TYPE_F16) { + is_f16 = true; + } else if (kv_tensor.second->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: error: Quantization should be tested with a float model, " + "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); + llama_free(ctx); + return 1; + } + included_layers++; + max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second)); + } + + if (is_f16) { + printf("note: source model is f16\n"); + } + printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); + // allocate scratch space + std::vector input_scratch(SCRATCH_ELEMENTS); + std::vector quantized_scratch(SCRATCH_ELEMENTS*4); + std::vector output_scratch(SCRATCH_ELEMENTS); + + // loop throught quantization types + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { + continue; + } + quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + if (qfns.quantize_row_q && qfns.dequantize_row_q) { + if (params.verbose) { + printf("testing %s ...\n", type_strs[i]); + } + + error_stats global_stats {}; + + for (const auto& kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf(" %s ...\n", kv_tensor.first.c_str()); + } + std::string layer_name { type_strs[i] }; + layer_name += "::" + kv_tensor.first; + test_roundtrip_on_layer( + layer_name, + params.per_layer_stats, + qfns, + params.reference, + kv_tensor.second, + input_scratch.data(), + quantized_scratch.data(), + output_scratch.data(), + global_stats + ); + } + + print_error_stats(type_strs[i], global_stats, params.print_histogram); + } + } + + + llama_free(ctx); + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: total time = %8.2f ms\n", __func__, 
(t_main_end_us - t_main_start_us)/1000.0); + } + + return 0; +} diff --git a/flake.nix b/flake.nix index 4c2717e0d..cd1b6d28e 100644 --- a/flake.nix +++ b/flake.nix @@ -30,6 +30,9 @@ mkdir -p $out/bin mv bin/main $out/bin/llama mv bin/quantize $out/bin/quantize + mv bin/embedding $out/bin/embedding + mv bin/perplexity $out/bin/perplexity + echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml chmod +x $out/bin/convert-pth-to-ggml diff --git a/ggml.c b/ggml.c index 8a60bc383..ada3bbbdc 100644 --- a/ggml.c +++ b/ggml.c @@ -26,14 +26,9 @@ #define static_assert(cond, msg) struct global_scope_noop_trick #endif -#if defined _MSC_VER || defined(__MINGW32__) +#if defined(_WIN32) -#if !defined(__MINGW32__) -#include -#else -// ref: https://github.com/ggerganov/whisper.cpp/issues/168 #include -#endif typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; @@ -97,17 +92,6 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif -#define GGML_MLOCK_SUPPORT 0 - -#ifdef __has_include - #if __has_include() - #undef GGML_MLOCK_SUPPORT - #define GGML_MLOCK_SUPPORT 1 - #include - #endif -#endif - - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -610,10 +594,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); - // absolute max - const float amax = MAX( - MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)), - MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); + const float amax = vmaxvq_f32(amaxv[0]); const float d = amax / ((1 << 3) - 1); const float id = d ? 
1.0f/d : 0.0f; @@ -935,7 +916,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int float32x4_t minv[8]; float32x4_t maxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l); for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); @@ -958,7 +939,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int for (int l = 0; l < 8; l++) { const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const int32x4_t vi = vcvtq_s32_f32(v); + const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest + const int32x4_t vi = vcvtq_s32_f32(vf); y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); @@ -1962,7 +1944,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - /* Prepare the constants we will need during execution */ + /* Prepare the constants we will need during execution */ const __m256i lowMask = _mm256_set1_epi8( 0xF ); const __m256i offset_8 = _mm256_set1_epi16( 8 ); @@ -1972,61 +1954,59 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Main loop for (int i = 0; i < nb; i+=UNROLL_COUNT) { - - // This loop will be unrolled by the compiler + // This loop will be unrolled by the compiler for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); + /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */ + __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); + /* Convert to vectore of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps( xy_q ); - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( scale, q, acc ); } - - } + } // Return horizontal sum of the acc vector __m128 res = _mm256_extractf128_ps( acc, 1 ); @@ -2087,18 +2067,18 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest float sum1 = 0.0f; for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &px[i + 0]; - const block_q4_0 * restrict y0 = &py[i + 0]; - const block_q4_0 * restrict x1 = &px[i + 1]; - const block_q4_0 * restrict y1 = &py[i + 1]; + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict y0 = &y[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q4_0 * restrict y1 = &y[i + 1]; const v128_t m4b = wasm_u8x16_splat(0xf); const v128_t s8b = wasm_i8x16_splat(0x8); - const v128_t v0_0 = wasm_v128_load(x0.qs); - const v128_t v0_1 = wasm_v128_load(y0.qs); - const v128_t v1_0 = wasm_v128_load(x1.qs); - const v128_t v1_1 = wasm_v128_load(y1.qs); + const v128_t v0_0 = wasm_v128_load(x0->qs); + const v128_t v0_1 = wasm_v128_load(y0->qs); + const v128_t v1_0 = wasm_v128_load(x1->qs); + const v128_t v1_1 = wasm_v128_load(y1->qs); // 4-bit -> 8-bit const v128_t v0_0l = wasm_v128_and(v0_0, m4b); @@ -2629,6 +2609,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "SCALE", "CPY", + "CONT", "RESHAPE", "VIEW", "PERMUTE", @@ -2644,7 +2625,7 @@ static const char * 
GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_FF", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2673,6 +2654,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "x*v", "x-\\>y", + "cont(x)", "reshape(x)", "view(x)", "permute(x)", @@ -2688,22 +2670,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_ff(x)", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); - -// -// ggml object -// - -struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -2716,7 +2683,6 @@ struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; - bool mem_buffer_mlocked; bool no_alloc; int n_objects; @@ -3003,7 +2969,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_size =*/ params.mem_size, /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.mem_buffer_mlocked =*/ false, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, @@ -3036,14 +3001,6 @@ void ggml_free(struct ggml_context * ctx) { GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - if (munlock(ctx->mem_buffer, ctx->mem_size)) { - fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno)); - } - } -#endif - if (ctx->mem_buffer_owned) { free(ctx->mem_buffer); } @@ -3072,48 +3029,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } -#ifdef __APPLE__ -#define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" -#else -#define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" -#endif - -bool ggml_mlock_supported(void) { - return GGML_MLOCK_SUPPORT; -} - -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p) { - // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32 -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - return true; - } - if (mlock(ctx->mem_buffer, ctx->mem_size) || - (opt_extra_len && - mlock(opt_extra_addr, opt_extra_len))) { - if ((*err_p = malloc(1024))) { - snprintf(*err_p, 1024, - "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, - ctx->mem_size + opt_extra_len, - strerror(errno)); - } - return false; - } - ctx->mem_buffer_mlocked = true; - return true; -#else // GGML_MLOCK_SUPPORT - *err_p = strdup("can't mlock because it's not supported on this system"); - return false; -#endif // GGML_MLOCK_SUPPORT -} - //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( @@ -4388,6 +4303,41 @@ struct ggml_tensor * ggml_cpy_inplace( return ggml_cpy_impl(ctx, a, b, true); } +// ggml_cont + +struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); +} + // ggml_reshape struct ggml_tensor * ggml_reshape( @@ -4930,6 +4880,85 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + if (ggml_is_contiguous(dst)) { + if (src0->nb[0] == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const 
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -5024,6 +5053,105 @@ static void ggml_compute_forward_dup_f32( return; } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (src0->nb[0] == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -5144,14 +5272,18 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { - const int j0 = (n/nth)*ith; 
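
// This change to ggml_compute_forward_add_f32 replaces the old block
// partitioning of rows (j0..j1 per thread) with a strided loop,
// `for (int j = ith; j < n; j += nth)`, and adds an Accelerate (vDSP_vadd)
// fast path. A minimal sketch of the strided scheme with hypothetical names
// (ith = thread index, nth = thread count):
static void add_rows_strided(int n_rows, int n_cols, int ith, int nth,
                             const float * src0, const float * src1, float * dst) {
    // each thread handles rows ith, ith + nth, ith + 2*nth, ..., so the work
    // stays balanced even when n_rows is not a multiple of nth
    for (int j = ith; j < n_rows; j += nth) {
        for (int i = 0; i < n_cols; i++) {
            dst[j*n_cols + i] = src0[j*n_cols + i] + src1[j*n_cols + i];
        }
    }
}
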
- const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1); - - for (int j = j0; j < j1; j++) { + for (int j = ith; j < n; j += nth) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + j*nb01), 1, + (float *) ((char *) src1->data + j*nb11), 1, + (float *) ((char *) dst->data + j*nb1), 1, nc); +#else ggml_vec_add_f32(nc, (float *) ((char *) dst->data + j*nb1), (float *) ((char *) src0->data + j*nb01), (float *) ((char *) src1->data + j*nb11)); +#endif } } else { // src1 is not contiguous @@ -6564,29 +6696,27 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k); -typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k); -typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y); - -typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - vec_dot_q_t vec_dot_q; -} quantize_fns_t; - static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .vec_dot_q = ggml_vec_dot_q4_0, + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .vec_dot_q = ggml_vec_dot_q4_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .vec_dot_q = ggml_vec_dot_q4_1, }, }; +// For internal test use +quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { + GGML_ASSERT(i < GGML_TYPE_COUNT); + return quantize_fns[i]; +} + static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -6901,6 +7031,15 @@ static void ggml_compute_forward_cpy( ggml_compute_forward_dup(params, src0, dst); } +// ggml_compute_forward_cont + +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( @@ -8731,6 +8870,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_cpy(params, tensor->src0, tensor); } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor->src0, tensor); + } break; case GGML_OP_RESHAPE: { ggml_compute_forward_reshape(params, tensor->src0, tensor); @@ -8975,8 +9118,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_impl(ctx, src1->grad, - // TODO: fix transpose, the node will break the graph connections - ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad), + ggml_mul_mat(ctx, + ggml_cont(ctx, ggml_transpose(ctx, src0)), + tensor->grad), inplace); } } break; @@ -8988,6 +9132,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONT: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RESHAPE: { GGML_ASSERT(false); // TODO: not implemented @@ -9442,6 +9590,7 @@ void ggml_graph_compute(struct ggml_context 
* ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_CPY: + case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: diff --git a/ggml.h b/ggml.h index 3c94efc35..a5245a8ae 100644 --- a/ggml.h +++ b/ggml.h @@ -236,6 +236,7 @@ enum ggml_op { GGML_OP_SCALE, GGML_OP_CPY, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_VIEW, GGML_OP_PERMUTE, @@ -253,6 +254,19 @@ enum ggml_op { GGML_OP_COUNT, }; + +// ggml object +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + char padding[8]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -344,13 +358,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); -bool ggml_mlock_supported(void); -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p); - struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, @@ -519,6 +526,11 @@ struct ggml_tensor * ggml_cpy( struct ggml_tensor * a, struct ggml_tensor * b); +// make contiguous +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view struct ggml_tensor * ggml_reshape( @@ -783,6 +795,30 @@ int ggml_cpu_has_blas(void); int ggml_cpu_has_sse3(void); int ggml_cpu_has_vsx(void); + +// +// Internal types and functions exposed for tests and benchmarks +// + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif +typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + +typedef struct { + dequantize_row_q_t dequantize_row_q; + quantize_row_q_t quantize_row_q; + quantize_row_q_t quantize_row_q_reference; + vec_dot_q_t vec_dot_q; +} quantize_fns_t; + +quantize_fns_t ggml_internal_get_quantize_fn(size_t i); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index 3732e3809..4e77bb31b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,49 +1,26 @@ +#include "llama_util.h" #include "llama.h" +#include "llama_internal.h" #include "ggml.h" +#include #include #include #include #include #include #include -#include #include #include - -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) -#define WIN32_LEAN_AND_MEAN -#include -#else -#include -#include -#include -#include -#endif - -#define Min(X, Y) ((Y) > (X) ? (X) : (Y)) -#define Max(X, Y) ((Y) < (X) ? 
(X) : (Y)) +#include +#include +#include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 -#define LLAMA_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - - -// determine number of model parts based on the dimension -static const std::unordered_map LLAMA_N_PARTS = { - { 4096, 1 }, - { 5120, 2 }, - { 6656, 4 }, - { 8192, 8 }, -}; // available llama models enum e_model { @@ -93,14 +70,18 @@ static const std::map MEM_REQ_EVAL = { // default hparams (LLaMA 7B) struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t f16 = 1; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } }; struct llama_layer { @@ -126,11 +107,17 @@ struct llama_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; - std::vector buf; + llama_buffer buf; int n; // number of tokens currently in the cache + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_model { @@ -146,22 +133,30 @@ struct llama_model { std::vector layers; // context - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; // key + value cache for the self attention // TODO: move to llama_state struct llama_kv_cache kv_self; // the model memory buffer - std::vector buf; + llama_buffer buf; // model memory mapped file - void * mm_addr = NULL; - uint64_t mm_length = 0; + std::unique_ptr mapping; - // tensors - int n_loaded; - std::unordered_map tensors; + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_vocab { @@ -206,8 +201,8 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - std::vector buf_compute; - std::vector buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_buffer buf_compute; + llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -220,11 +215,11 @@ struct llama_context { last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { - buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size); + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); } buf_last = i; @@ -244,6 +239,508 @@ struct llama_context { } }; +template +static T checked_mul(T a, T b) { + T ret = a * b; + if (a != 0 && ret / a != b) { + throw format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b); + } + return ret; +} + +static size_t checked_div(size_t a, size_t b) { + if (b == 0 || a % b != 0) { + throw format("error dividing %zu / %zu", a, b); + } + return a / b; +} + +static std::string llama_format_tensor_shape(const 
std::vector & ne) { + std::string ret = "[" + std::to_string(ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + ret += " x " + std::to_string(ne.at(i)); + } + ret += "]"; + return ret; +} + +static const char * llama_format_type(enum ggml_type type) { + switch (type) { + case GGML_TYPE_F32: return "f32"; + case GGML_TYPE_F16: return "f16"; + case GGML_TYPE_Q4_0: return "q4_0"; + case GGML_TYPE_Q4_1: return "q4_1"; + default: LLAMA_ASSERT(false); + } +} + +static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { + size_t size = ggml_type_size(type); + for (uint32_t dim : ne) { + size = checked_mul(size, dim); + } + return size / ggml_blck_size(type); +} + +struct llama_load_tensor_shard { + std::vector ne; + size_t size; + enum ggml_type type; + size_t file_idx; + size_t file_off; + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +enum llama_split_type { + SPLIT_NONE, + SPLIT_BY_COLUMNS, + SPLIT_BY_ROWS +}; + +struct llama_load_tensor { + std::vector shards; + + std::string name; + enum ggml_type type = GGML_TYPE_F32; + llama_split_type split_type = SPLIT_NONE; + std::vector ne; + size_t size; + struct ggml_tensor * ggml_tensor = NULL; + uint8_t * data; + + llama_load_tensor(const std::string & name) : name(name) {} + + void calc_all() { + calc_type(); + calc_split_type(); + calc_ne(); + calc_size(); + } + + void calc_type() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.type != first_shard.type) { + throw format("inconsistent tensor shard type in '%s'", name.c_str()); + } + } + type = first_shard.type; + } + + void calc_split_type() { + if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file + shards.size() == 1) { // only one file? 
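
// A minimal usage sketch for the overflow-checked helpers defined above
// (checked_mul and llama_calc_tensor_size), assuming the includes already
// present in llama.cpp. The shape is an arbitrary example, not taken from a
// real model file:
static size_t example_tensor_bytes() {
    // e.g. a 4096 x 32000 tok_embeddings tensor stored as q4_0:
    // ggml_type_size(Q4_0) bytes per block of ggml_blck_size(Q4_0) == 32 values
    std::vector<uint32_t> ne = { 4096, 32000 };
    return llama_calc_tensor_size(ne, GGML_TYPE_Q4_0);
}
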
+ split_type = SPLIT_NONE; + } else if (name.find("tok_embeddings.") == 0 || + name.find(".attention.wo.weight") != std::string::npos || + name.find(".feed_forward.w2.weight") != std::string::npos) { + split_type = SPLIT_BY_COLUMNS; + } else { + split_type = SPLIT_BY_ROWS; + } + } + + void calc_ne() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.ne != first_shard.ne) { + throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + } + } + ne = first_shard.ne; + LLAMA_ASSERT(shards.size() <= UINT32_MAX); + uint32_t n_shards = (uint32_t) shards.size(); + switch (split_type) { + case SPLIT_NONE: + ne = first_shard.ne; + break; + case SPLIT_BY_COLUMNS: + ne = {checked_mul(first_shard.ne[0], n_shards), + first_shard.ne[1]}; + break; + case SPLIT_BY_ROWS: + ne = {first_shard.ne[0], + checked_mul(first_shard.ne[1], n_shards)}; + break; + } + } + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +struct llama_load_tensors_map { + // tensors is kept in a separate vector to preserve file order + std::vector tensors; + std::unordered_map name_to_idx; +}; + +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding +}; + +struct llama_file_loader { + llama_file file; + llama_file_version file_version; + llama_hparams hparams; + llama_vocab vocab; + + llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + : file(fname, "rb") { + fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(file_idx, tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + uint32_t version = 0; + + if (magic != 'ggml') { + version = file.read_u32(); + } + + if (magic == 'ggml' && version == 0) { + file_version = LLAMA_FILE_VERSION_GGML; + } else if (magic == 'ggmf' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGMF_V1; + } else if (magic == 'ggjt' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else { + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); + } + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.f16 = file.read_u32(); + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + file.read_raw(&score, sizeof(score)); + } + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_load_tensor_shard shard; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + uint32_t ftype = file.read_u32(); + shard.ne.resize(n_dims); + file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + std::string name = 
file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + } + switch (ftype) { + case 0: shard.type = GGML_TYPE_F32; break; + case 1: shard.type = GGML_TYPE_F16; break; + case 2: shard.type = GGML_TYPE_Q4_0; break; + case 3: shard.type = GGML_TYPE_Q4_1; break; + default: { + throw format("unrecognized ftype %u\n", ftype); + } + } + + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + // skip to the next multiple of 32 bytes + file.seek(-file.tell() & 31, SEEK_CUR); + } + shard.file_idx = file_idx; + shard.file_off = file.tell(); + + shard.calc_size(); + file.seek(shard.size, SEEK_CUR); + + auto it = tensors_map.name_to_idx.find(name); + size_t idx; + if (it != tensors_map.name_to_idx.end()) { + idx = it->second; + } else { + tensors_map.tensors.emplace_back(name); + idx = tensors_map.tensors.size() - 1; + tensors_map.name_to_idx.emplace(name, idx); + } + tensors_map.tensors.at(idx).shards.push_back(shard); + } + } +}; + +struct llama_file_saver { + llama_file file; + llama_file_loader * any_file_loader; + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + : file(fname, "wb"), any_file_loader(any_file_loader) { + fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_f16); + write_vocab(); + } + void write_magic() { + file.write_u32('ggjt'); // magic + file.write_u32(1); // version + } + void write_hparams(uint32_t new_f16) { + const llama_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_mult); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.n_rot); + file.write_u32(new_f16); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + uint32_t ftype; + switch (new_type) { + case GGML_TYPE_F32: ftype = 0; break; + case GGML_TYPE_F16: ftype = 1; break; + case GGML_TYPE_Q4_0: ftype = 2; break; + case GGML_TYPE_Q4_1: ftype = 3; break; + default: LLAMA_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(ftype); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-file.tell() & 31, SEEK_CUR); + LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_model_loader { + std::vector> file_loaders; + llama_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr mapping; + + llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); 
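
// read_magic() above relies on multi-character literals ('ggml', 'ggmf',
// 'ggjt'), which is why -Wno-multichar is added to the build flags in this
// change. A minimal sketch of the same detection logic over a plain FILE*,
// assuming the usual GCC/Clang packing of the literal into a uint32_t:
#include <cstdint>
#include <cstdio>
#include <stdexcept>

enum class file_version { ggml_unversioned, ggmf_v1, ggjt_v1 };

static file_version detect_file_version(std::FILE * f) {
    uint32_t magic = 0;
    uint32_t version = 0;
    std::fread(&magic, sizeof(magic), 1, f);
    if (magic != 'ggml') {
        // every format newer than the original 'ggml' one carries a version field
        std::fread(&version, sizeof(version), 1, f);
    }
    if (magic == 'ggml' && version == 0) return file_version::ggml_unversioned;
    if (magic == 'ggmf' && version == 1) return file_version::ggmf_v1;
    if (magic == 'ggjt' && version == 1) return file_version::ggjt_v1;
    throw std::runtime_error("unknown (magic, version) combination");
}
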
+        file_loaders.emplace_back(first_file);
+        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
+        for (uint32_t i = 1; i < n_parts; i++) {
+            std::string fname = fname_base + "." + std::to_string(i);
+            auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
+            file_loaders.emplace_back(ith_file);
+            if (ith_file->hparams != first_file->hparams) {
+                throw format("llama.cpp: hparams inconsistent between files");
+            }
+        }
+        if (!llama_mmap::SUPPORTED) {
+            use_mmap = false;
+        }
+        if (use_mmap && alignment_prevents_mmap()) {
+            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
+            use_mmap = false;
+        }
+        this->use_mmap = use_mmap;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            lt.calc_all();
+        }
+    }
+
+    bool alignment_prevents_mmap() {
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            for (const llama_load_tensor_shard & shard : lt.shards) {
+                if (shard.file_off & 3) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    uint32_t guess_n_parts() const {
+        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::string("missing tok_embeddings.weight");
+        }
+        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+    }
+
+    void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
+        *ctx_size_p = *mmapped_size_p = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+        }
+    }
+
+    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+        }
+        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+        }
+        return get_tensor_for(lt);
+    }
+
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+        struct ggml_tensor * tensor;
+        if (lt.ne.size() == 2) {
+            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
+        } else {
+            LLAMA_ASSERT(lt.ne.size() == 1);
+            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
+        }
+        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        lt.ggml_tensor = tensor;
+        num_ggml_tensors_created++;
+        return tensor;
+    }
+
+    void done_getting_tensors() {
+        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
+            throw std::string("llama.cpp: file contained more tensors than expected");
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+        size_t data_size = 0;
+        for (const llama_load_tensor & lt : tensors_map.tensors) {
+            data_size += lt.size;
+        }
+
+        if (use_mmap) {
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            if (!lmlock) {
+                // Don't call the callback since the actual loading will be lazy
+                // and we can't measure it.
+                progress_callback = NULL;
+            }
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+        }
+
+        size_t done_size = 0;
+        for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
+            lt.data = (uint8_t *) lt.ggml_tensor->data;
+            load_data_for(lt);
+            lt.ggml_tensor->data = lt.data;
+            done_size += lt.size;
+            if (use_mmap && lmlock) {
+                lmlock->grow_to(done_size);
+            }
+        }
+        if (progress_callback) {
+            progress_callback(1.0f, progress_callback_user_data);
+        }
+    }
+
+    void load_data_for(llama_load_tensor & lt) {
+        if (use_mmap) {
+            LLAMA_ASSERT(lt.shards.size() == 1);
+            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
+        } else if (lt.split_type == SPLIT_NONE) {
+            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
+            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        } else if (lt.split_type == SPLIT_BY_ROWS) {
+            size_t offset = 0;
+            for (llama_load_tensor_shard & shard : lt.shards) {
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                file.read_raw(lt.data + offset, shard.size);
+                offset += shard.size;
+            }
+            LLAMA_ASSERT(offset == lt.size);
+        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
+            // Let's load the data into temporary buffers to ensure the OS performs large loads.
+            std::vector<llama_buffer> tmp_bufs;
+            tmp_bufs.resize(lt.shards.size());
+            for (size_t i = 0; i < lt.shards.size(); i++) {
+                llama_load_tensor_shard & shard = lt.shards.at(i);
+                llama_file & file = file_loaders.at(shard.file_idx)->file;
+                file.seek(shard.file_off, SEEK_SET);
+                tmp_bufs.at(i).resize(shard.size);
+                file.read_raw(tmp_bufs.at(i).addr, shard.size);
+            }
+            // Then reshape.
+ size_t num_rows = lt.ne.at(1); + size_t per_shard_row_size = lt.shards.at(0).size / num_rows; + size_t out_offset = 0; + for (size_t row = 0; row < num_rows; row++) { + for (llama_buffer & tmp_buf : tmp_bufs) { + memcpy(lt.data + out_offset, + tmp_buf.addr + row * per_shard_row_size, + per_shard_row_size); + out_offset += per_shard_row_size; + } + } + LLAMA_ASSERT(out_offset == lt.size); + } + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(llama_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + + // // kv cache // @@ -262,8 +759,8 @@ static bool kv_cache_init( cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; - params.mem_size = cache.buf.size(); - params.mem_buffer = cache.buf.data(); + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); @@ -279,13 +776,6 @@ static bool kv_cache_init( return true; } -static void kv_cache_free(struct llama_kv_cache & cache) { - if (cache.ctx) { - ggml_free(cache.ctx); - cache.ctx = nullptr; - } -} - struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -294,6 +784,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, /*.progress_callback =*/ nullptr, @@ -303,243 +794,94 @@ struct llama_context_params llama_context_default_params() { return result; } +bool llama_mmap_supported() { + return llama_mmap::SUPPORTED; +} + +bool llama_mlock_supported() { + return llama_mlock::SUPPORTED; +} + // // model loading // -static void *mmap_file(const char *fname, uint64_t *mm_length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - HANDLE hFile = CreateFileA(fname, - GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, - NULL); - if (hFile == INVALID_HANDLE_VALUE) return 0; - LARGE_INTEGER fileSize; - fileSize.QuadPart = -1; - GetFileSizeEx(hFile, &fileSize); - int64_t length = fileSize.QuadPart; - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - CloseHandle(hFile); - if (!hMapping) return 0; - void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - CloseHandle(hMapping); - if (!addr) return 0; -#else - int fd = open(fname, O_RDONLY); - if (fd == -1) return 0; - int64_t length = lseek(fd, 0, SEEK_END); - void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) return 0; -#endif - *mm_length = length; - return addr; +static const char *llama_file_version_name(llama_file_version version) { + switch (version) { + case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)"; + default: LLAMA_ASSERT(false); + } } -static void munmap_file(void * addr, size_t length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - UnmapViewOfFile(addr); -#else - 
munmap(addr, length); -#endif +static const char *llama_model_type_name(e_model type) { + switch (type) { + case MODEL_7B: return "7B"; + case MODEL_13B: return "13B"; + case MODEL_30B: return "30B"; + case MODEL_65B: return "65B"; + default: LLAMA_ASSERT(false); + } } -static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) { - fprintf(stderr, - "%s: invalid model file (bad magic [got %#x want %#x])\n" - "\tyou most likely need to regenerate your ggml files\n" - "\tthe benefit is you'll get 10-100x faster load times\n" - "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n" - "\tuse convert-pth-to-ggml.py to regenerate from original pth\n" - "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n", - path, got, want); - return false; -} - -static bool llama_model_load( +static void llama_model_load_internal( const std::string & fname, llama_context & lctx, int n_ctx, - int n_parts, ggml_type memory_type, + bool use_mmap, + bool use_mlock, bool vocab_only, llama_progress_callback progress_callback, - void *progress_callback_user_data) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + void * progress_callback_user_data) { lctx.t_start_us = ggml_time_us(); + std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + + lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); auto & model = lctx.model; - auto & vocab = lctx.vocab; + model.hparams = ml->file_loaders.at(0)->hparams; + llama_file_version file_version = ml->file_loaders.at(0)->file_version; + auto & hparams = model.hparams; + uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - std::vector f_buf(1024*1024); - fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); - - fin.seekg(0, fin.end); - const size_t file_size = fin.tellg(); - fin.seekg(0); - - // verify magic { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n", - __func__, fname.c_str()); - return false; + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = e_model::MODEL_65B; break; } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC); - } - - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - } - - int n_ff = 0; - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - 
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); hparams.n_ctx = n_ctx; - - n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - - if (n_parts < 1) { - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - } - - // temp warning to tell the user to use "--n_parts" - if (hparams.f16 == 4 && n_parts != 1) { - fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts); - fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); - } - - if (hparams.n_layer == 32) { - model.type = e_model::MODEL_7B; - } - - if (hparams.n_layer == 40) { - model.type = e_model::MODEL_13B; - } - - if (hparams.n_layer == 60) { - model.type = e_model::MODEL_30B; - } - - if (hparams.n_layer == 80) { - model.type = e_model::MODEL_65B; - } - - fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); - fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); - fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - fprintf(stderr, "%s: type = %d\n", __func__, model.type); } - // load vocab { - std::string word; - vocab.id_to_token.resize(model.hparams.n_vocab); - std::vector tmp(64); - - for (int i = 0; i < model.hparams.n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - if (len > 0) { - tmp.resize(len); - fin.read(tmp.data(), len); - word.assign(tmp.data(), len); - } else { - word.clear(); - } - - float score; - fin.read((char *) &score, sizeof(score)); - - vocab.token_to_id[word] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word; - tok_score.score = score; - } + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (vocab_only) { - return true; + return; } - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - // wtype is for per-layer weights, while vtype is for other weights - ggml_type wtype, vtype; - switch (model.hparams.f16) { - case 0: wtype = vtype = GGML_TYPE_F32; break; - case 1: wtype = vtype = GGML_TYPE_F16; break; - case 2: wtype = vtype = GGML_TYPE_Q4_0; break; - case 3: wtype = vtype = GGML_TYPE_Q4_1; break; - case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break; - default: - { - fprintf(stderr, 
"%s: invalid model file '%s' (bad f16 value %d)\n", - __func__, fname.c_str(), model.hparams.f16); - return false; - } - } - - // map model into memory - char *mm_addr = NULL; - model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); - if (model.mm_addr == NULL) { - fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); - return false; - } - mm_addr = (char *)model.mm_addr; - fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); - auto & ctx = model.ctx; - size_t ctx_size = 0; - { - const auto &hparams = model.hparams; - const int n_layer = hparams.n_layer; - ctx_size += (5 + 10*n_layer)*256; // object overhead - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - } + size_t ctx_size, mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); // print memory requirements { @@ -548,7 +890,7 @@ static bool llama_model_load( // this is the total memory required to run the inference const size_t mem_required = ctx_size + - model.mm_length + + mmapped_size + MEM_REQ_SCRATCH0.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_EVAL.at (model.type); @@ -564,17 +906,20 @@ static bool llama_model_load( // create the ggml context { lctx.model.buf.resize(ctx_size); + if (use_mlock) { + lctx.model.mlock_buf.init(lctx.model.buf.addr); + lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size(), - /*.mem_buffer =*/ lctx.model.buf.data(), - /*.no_alloc =*/ true, + /*.mem_size =*/ lctx.model.buf.size, + /*.mem_buffer =*/ lctx.model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, }; model.ctx = ggml_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; + throw format("ggml_init() failed"); } } @@ -582,161 +927,71 @@ static bool llama_model_load( { const auto & hparams = model.hparams; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + ml->ggml_ctx = ctx; + + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); model.layers.resize(n_layer); - - model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - // map by name - model.tensors["tok_embeddings.weight"] = model.tok_embeddings; - - model.tensors["norm.weight"] = model.norm; - model.tensors["output.weight"] = model.output; - - for (int i = 0; i < n_layer; ++i) { + for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + std::string layers_i = "layers." 
+ std::to_string(i); - layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); - layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; - - model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); } } - std::vector tmp; + ml->done_getting_tensors(); - if (progress_callback) { - progress_callback(0.0, progress_callback_user_data); + // populate `tensors_by_name` + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } - fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str()); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? 
&lctx.model.mlock_mmap : NULL); - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return false; - } - - auto tensor = model.tensors[name.data()]; - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return false; - } - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - return false; - } - if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - switch (ftype) { - case 0: // f32 - case 1: // f16 - break; - case 2: // q4_0 - case 3: // q4_1 - assert(ne[0] % 64 == 0); - break; - default: - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - return false; - }; - - // load the tensor data into memory without copying or reading it - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(tensor); - offset = (offset + 31) & -32; - tensor->data = mm_addr + offset; - fin.seekg(offset + tensor_data_size); - total_size += tensor_data_size; - model.n_loaded++; - - // progress - if (progress_callback) { - double current_progress = size_t(fin.tellg()) / double(file_size); - progress_callback(current_progress, progress_callback_user_data); - } - } - - fin.close(); - - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); - if (model.n_loaded == 0) { - fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); - } else if (model.n_loaded != (int) model.tensors.size()) { - fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); - return false; - } - } + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration lctx.t_load_us = ggml_time_us() - lctx.t_start_us; +} - if (progress_callback) { - progress_callback(1.0, progress_callback_user_data); +static bool llama_model_load( + const std::string & fname, + llama_context & lctx, + int n_ctx, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::string & err) { + fprintf(stderr, "error loading model: %s\n", err.c_str()); + return false; } - - return true; } // 
evaluate the transformer @@ -776,8 +1031,8 @@ static bool llama_eval_internal( auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size(), - /*.mem_buffer =*/ buf_compute.data(), + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; @@ -1063,7 +1318,7 @@ struct llama_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_sp_symbol sym; - size_t char_len = Min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -1238,19 +1493,13 @@ static llama_vocab::id llama_sample_top_p_top_k( } } - if (top_k > 0 && top_k < n_logits) { - sample_top_k(logits_id, top_k); - } - - float maxl = -std::numeric_limits::infinity(); - for (const auto & kv : logits_id) { - maxl = Max(maxl, kv.first); - } + sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits); // compute probs for the top k tokens std::vector probs; probs.reserve(logits_id.size()); + float maxl = logits_id[0].first; double sum = 0.0; for (const auto & kv : logits_id) { const float p = expf(kv.first - maxl); @@ -1273,16 +1522,11 @@ static llama_vocab::id llama_sample_top_p_top_k( break; } } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } } //printf("\n"); //for (int i = 0; i < (int) 10; i++) { - // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); + // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]); //} //printf("\n\n"); //exit(0); @@ -1297,298 +1541,118 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -// TODO: reuse code from the llama_model_load() somehow -static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { - ggml_type type = GGML_TYPE_Q4_1; - +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { + ggml_type quantized_type; switch (itype) { - case 2: type = GGML_TYPE_Q4_0; break; - case 3: type = GGML_TYPE_Q4_1; break; - default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; + case 2: quantized_type = GGML_TYPE_Q4_0; break; + case 3: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid quantization type %d\n", itype); }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { - fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); - return false; - } + std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + /*vocab_only*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); - llama_vocab vocab; + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector hist_all(1 << 4, 0); - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + size_t idx = 0; + for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } + printf("[%zu/%zu] %36s - %s, type = %6s, ", + ++idx, 
model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + llama_format_type(tensor.type)); - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", - __func__, fname_inp.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC); - } + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); - fout.write((char *) &magic, sizeof(magic)); + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_buffer work; - uint32_t format_version; - finp.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - - fout.write((char *) &format_version, sizeof(format_version)); - } - - llama_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &itype, sizeof(hparams.f16)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::vector word(32); - vocab.id_to_token.resize(n_vocab); - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) &word[0], len); - fout.write((char *) &word[0], len); - - float score; - finp.read ((char *) &score, 
sizeof(score)); - fout.write((char *) &score, sizeof(score)); - - vocab.token_to_id[word.data()] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word.data(); - tok_score.score = score; - } - } - - // load weights - { - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast(&length), sizeof(length)); - finp.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - { - // ensure tensor data is aligned - uint64_t offset = finp.tellg(); - offset = (offset + 31) & -32; - finp.seekg(offset); - } - - { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - // regexes of tensor names to be quantized - const std::vector k_names = { - ".*weight", - }; - - bool quantize = false; - for (const auto & s : k_names) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_buffer f32_conv_buf; + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (tensor.type == GGML_TYPE_F16) { + f32_conv_buf.resize(nelements * sizeof(float)); + f32_data = (float *) f32_conv_buf.addr; + auto f16_data = (const ggml_fp16_t *) tensor.data; + for (size_t i = 0; i < nelements; i++) { + f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ftype != 0 && ftype != 1) { - fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); - return false; - } - - if (ftype == 1) { - data_f16.resize(nelements); - finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ftype = itype; } else { - const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type)); } - fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast(&length), sizeof(length)); - fout.write(reinterpret_cast(&ftype), sizeof(ftype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - fout.write(&name[0], length); + printf("quantizing .. 
"); + fflush(stdout); - { - // ensure tensor data is aligned - uint64_t offset = fout.tellp(); - offset = (offset + 31) & -32; - fout.seekp(offset); + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: + { + new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + default: + LLAMA_ASSERT(false); } - if (quantize) { - printf("quantizing .. "); - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch (type) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - default: - { - fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); - return false; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / float(nelements)); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); + printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; } - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / float(sum_all)); + for (size_t i = 0; i < hist_cur.size(); i++) { + printf("%5.3f ", hist_cur[i] / float(nelements)); } printf("\n"); } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); } - finp.close(); - fout.close(); + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - return true; + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + printf("%5.3f ", hist_all[i] / float(sum_all)); + } + printf("\n"); + } } // @@ -1606,32 +1670,36 @@ struct llama_context * llama_init_from_file( params.seed = time(NULL); } + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > 
*cur_percentage_p) { + ++*cur_percentage_p; + fprintf(stderr, "."); + fflush(stderr); + if (percentage >= 100) { + fprintf(stderr, "\n"); + } + } + }; + } + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, - params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); return nullptr; } - if (params.use_mlock) { - char *err; - if (!ggml_mlock(ctx->model.ctx, - ctx->model.mm_addr, - ctx->model.mm_length, - &err)) { - fprintf(stderr, "%s\n", err); - free(err); - llama_free(ctx); - return nullptr; - } - } - // reserve memory for context buffers if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { @@ -1668,16 +1736,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - kv_cache_free(ctx->model.kv_self); - - if (ctx->model.ctx) { - ggml_free(ctx->model.ctx); - } - - if (ctx->model.mm_addr) { - munmap_file(ctx->model.mm_addr, ctx->model.mm_length); - } - delete ctx; } @@ -1685,23 +1743,24 @@ int llama_model_quantize( const char * fname_inp, const char * fname_out, int itype) { - if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) { - fprintf(stderr, "%s: failed to quantize\n", __func__); + try { + llama_model_quantize_internal(fname_inp, fname_out, itype); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); return 1; } - - return 0; } // Returns the KV cache that will contain the context for the // ongoing prediction with the model. 
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.data();
+    return ctx->model.kv_self.buf.addr;
 }
 
 // Returns the size of the KV cache
 size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.size();
+    return ctx->model.kv_self.buf.size;
 }
 
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
@@ -1715,8 +1774,8 @@ void llama_set_kv_cache(
         size_t n_size,
         int n_token_count) {
     // Make sure we have the same kv cache setup
-    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
-    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
+    memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
     ctx->model.kv_self.n = n_token_count;
 }
 
@@ -1828,9 +1887,9 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    const int32_t n_sample = Max(1, ctx->n_sample);
-    const int32_t n_eval   = Max(1, ctx->n_eval);
-    const int32_t n_p_eval = Max(1, ctx->n_p_eval);
+    const int32_t n_sample = std::max(1, ctx->n_sample);
+    const int32_t n_eval   = std::max(1, ctx->n_eval);
+    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1866,3 +1925,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors_by_name;
+}
diff --git a/llama.h b/llama.h
index 14443adca..fdb00f3a2 100644
--- a/llama.h
+++ b/llama.h
@@ -55,6 +55,7 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
 
@@ -66,6 +67,9 @@ extern "C" {
 
     LLAMA_API struct llama_context_params llama_context_default_params();
 
+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -167,4 +171,4 @@ extern "C" {
 }
 #endif
 
-#endif
+#endif // LLAMA_H
diff --git a/llama_internal.h b/llama_internal.h
new file mode 100644
index 000000000..543eed996
--- /dev/null
+++ b/llama_internal.h
@@ -0,0 +1,12 @@
+// Internal header to be included by llama.cpp and tests/benchmarks only.
+
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif // LLAMA_INTERNAL_H
diff --git a/llama_util.h b/llama_util.h
new file mode 100755
index 000000000..d68f49bd2
--- /dev/null
+++ b/llama_util.h
@@ -0,0 +1,383 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #define NOMINMAX
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+};
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        close(fd);
+        if (addr == MAP_FAILED)
{ + throw format("mmap failed: %s", strerror(errno)); + } + + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + + ~llama_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file) { + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + CloseHandle(hFile); + + if (hMapping == NULL) { + throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); + } + + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_mmap(struct llama_file *) { + throw std::string("mmap not supported"); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; + + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * addr) { + LLAMA_ASSERT(this->addr == NULL && this->size == 0); + this->addr = addr; + } + + void grow_to(size_t target_size) { + LLAMA_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION, + size, this->size, std::strerror(errno)); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * addr, size_t size) { + for (int tries = 1; ; tries++) { + if (VirtualLock(addr, size)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + size, this->size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = size + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += size; + max_ws_size += size; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + void raw_unlock(void * addr, size_t size) { + if (!VirtualUnlock(addr, size)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + void raw_lock(const void * addr, size_t size) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + } + + void raw_unlock(const void * addr, size_t size) {} +#endif +}; + +// Replacement for std::vector that doesn't require zero-initialization. +struct llama_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] addr; + addr = new uint8_t[size]; + this->size = size; + } + + ~llama_buffer() { + delete[] addr; + } +}; +#endif
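
For reference, here is a minimal usage sketch (not part of the patch) of the new `use_mmap`/`use_mlock` options that this change exposes. It relies only on the public API declared in `llama.h` above (`llama_context_default_params`, `llama_mmap_supported`, `llama_mlock_supported`, `llama_init_from_file`, `llama_free`); the model path is a placeholder.

```cpp
// Sketch only: exercises the new context flags introduced by this patch.
#include "llama.h"
#include <cstdio>

int main() {
    struct llama_context_params params = llama_context_default_params();
    params.use_mmap  = llama_mmap_supported(); // map the weights lazily instead of reading them up front
    params.use_mlock = false;                  // set to llama_mlock_supported() to pin the mapped weights in RAM

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize, llama_eval(), llama_sample_top_p_top_k() ...

    llama_free(ctx);
    return 0;
}
```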