From 7297128db8159c7b12db4c28a4532b993025c2e5 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 7 Aug 2023 08:35:53 +0300 Subject: [PATCH 1/4] [Zig] Rewrite build for Zig 0.11 (#2514) * zig build fixes * Disable LTO on Windows. --- build.zig | 139 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 60 deletions(-) diff --git a/build.zig b/build.zig index 2287d2a2c..04c88d8a2 100644 --- a/build.zig +++ b/build.zig @@ -1,68 +1,87 @@ +// Compatible with Zig Version 0.11.0 const std = @import("std"); -const commit_hash = @embedFile(".git/refs/heads/master"); +const Compile = std.Build.Step.Compile; +const ConfigHeader = std.Build.Step.ConfigHeader; +const Mode = std.builtin.Mode; +const CrossTarget = std.zig.CrossTarget; + +const Maker = struct { + builder: *std.build.Builder, + target: CrossTarget, + optimize: Mode, + config_header: *ConfigHeader, + + const cflags = .{"-std=c11"}; + const cxxflags = .{"-std=c++11"}; + + fn init(builder: *std.build.Builder) Maker { + const commit_hash = @embedFile(".git/refs/heads/master"); + const config_header = builder.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline + }, + ); + return Maker{ + .builder = builder, + .target = builder.standardTargetOptions(.{}), + .optimize = builder.standardOptimizeOption(.{}), + .config_header = config_header, + }; + } + + fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { + const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (std.mem.endsWith(u8, src, ".c")) { + o.addCSourceFiles(&.{src}, &cflags); + o.linkLibC(); + } else { + o.addCSourceFiles(&.{src}, &cxxflags); + o.linkLibCpp(); + } + o.addIncludePath(.{ .path = "." }); + o.addIncludePath(.{ .path = "./examples" }); + return o; + } + + fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { + const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); + e.addIncludePath(.{ .path = "." }); + e.addIncludePath(.{ .path = "./examples" }); + e.addCSourceFiles(&.{src}, &cxxflags); + for (deps) |d| e.addObject(d); + e.linkLibC(); + e.linkLibCpp(); + e.addConfigHeader(m.config_header); + m.builder.installArtifact(e); + + // Currently a bug is preventing correct linking for optimized builds for Windows: + // https://github.com/ziglang/zig/issues/15958 + if (e.target.isWindows()) { + e.want_lto = false; + } + return e; + } +}; -// Zig Version: 0.11.0-dev.3986+e05c242cd pub fn build(b: *std.build.Builder) void { - const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); + const make = Maker.init(b); - const config_header = b.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline - }, - ); + const ggml = make.obj("ggml", "ggml.c"); + const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const llama = make.obj("llama", "llama.cpp"); + const common = make.obj("common", "examples/common.cpp"); + const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); - lib.linkLibC(); - lib.linkLibCpp(); - lib.addIncludePath("."); - lib.addIncludePath("./examples"); - lib.addConfigHeader(config_header); - lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); - lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); - b.installArtifact(lib); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama }); - const examples = .{ - "main", - "baby-llama", - "embedding", - "metal", - "perplexity", - "quantize", - "quantize-stats", - "save-load-state", - "server", - "simple", - "train-text-from-scratch", - }; - - inline for (examples) |example_name| { - const exe = b.addExecutable(.{ - .name = example_name, - .target = target, - .optimize = optimize, - }); - exe.addIncludePath("."); - exe.addIncludePath("./examples"); - exe.addConfigHeader(config_header); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - b.installArtifact(exe); - - const run_cmd = b.addRunArtifact(exe); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| run_cmd.addArgs(args); - - const run_step = b.step("run-" ++ example_name, "Run the app"); - run_step.dependOn(&run_cmd.step); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + if (server.target.isWindows()) { + server.linkSystemLibrary("ws2_32"); } } From 34a14b28ff7f3c98730339bacee035091b2a812a Mon Sep 17 00:00:00 2001 From: GiviMAD Date: Sun, 6 Aug 2023 23:21:46 -0700 Subject: [PATCH 2/4] [Makefile] Move ARM CFLAGS before compilation (#2536) --- Makefile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index e0528aeee..897c5cb9a 100644 --- a/Makefile +++ b/Makefile @@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) + CFLAGS += -mcpu=native + CXXFLAGS += -mcpu=native +endif + +ifneq ($(filter armv6%,$(UNAME_M)),) + # Raspberry Pi 1, Zero + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access +endif + +ifneq ($(filter armv7%,$(UNAME_M)),) + # Raspberry Pi 2 + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations +endif + +ifneq ($(filter armv8%,$(UNAME_M)),) + # Raspberry Pi 3, 4, Zero 2 (32-bit) + CFLAGS += -mfp16-format=ieee -mno-unaligned-access +endif + ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) @@ -270,28 +292,6 @@ ifdef LLAMA_METAL OBJS += ggml-metal.o endif # LLAMA_METAL -ifneq ($(filter aarch64%,$(UNAME_M)),) - # Apple M1, M2, etc. - # Raspberry Pi 3, 4, Zero 2 (64-bit) - CFLAGS += -mcpu=native - CXXFLAGS += -mcpu=native -endif - -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, Zero - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 2 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif - -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 3, 4, Zero 2 (32-bit) - CFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ From f6f9896ac3d2ff207e18f87dab85d126ceef5236 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Aug 2023 10:52:57 +0300 Subject: [PATCH 3/4] metal : fix out-of-bounds access + inc concurrency nodes (#2416) * metal : fix out-of-bounds access + style changes * metal : increase concurrency nodes to 2*GGML_MAX_NODES --- ggml-metal.m | 57 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 3f098d396..b47a98e21 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -7,6 +7,11 @@ #import #import +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + #ifdef GGML_METAL_NDEBUG #define metal_printf(...) #else @@ -15,6 +20,8 @@ #define UNUSED(x) (void)(x) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + struct ggml_metal_buffer { const char * name; @@ -36,7 +43,7 @@ struct ggml_metal_context { int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - int concur_list[GGML_MAX_NODES]; + int concur_list[GGML_MAX_CONCUR]; int concur_list_len; // custom kernels @@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[GGML_MAX_NODES]; + int nodes_unused[GGML_MAX_CONCUR]; - for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;} - for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;} + for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } + for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } ctx->concur_list_len = 0; - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos + int n_left = gf->n_nodes; + int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list + int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos while (n_left > 0) { // number of nodes at a layer (that can be issued concurrently) @@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency( for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { if (nodes_unused[i]) { // if the requirements for gf->nodes[i] are satisfied - int exe_flag=1; + int exe_flag = 1; + // scan all srcs for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; if (src_cur) { // if is leaf nodes it's satisfied. - if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;} + // TODO: ggml_is_leaf() + if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { + continue; + } // otherwise this src should be the output from previous nodes. int is_found = 0; + // scan 2*search_depth back because we inserted barrier. - for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;} + //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { + for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { + if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { + is_found = 1; + break; + } + } + if (is_found == 0) { + exe_flag = 0; + break; } - if (is_found == 0) {exe_flag = 0; break;} } } if (exe_flag) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); + int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); for (int j = n_start; j < i; j++) { if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ && gf->nodes[j]->op != GGML_OP_VIEW \ @@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency( if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { continue; - } else { - exe_flag = 0; } + + exe_flag = 0; } } } @@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency( ctx->concur_list[level_pos + concurrency] = -1; ctx->concur_list_len++; // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) {n_start++;} + while (!nodes_unused[n_start]) { + n_start++; + } level_pos += concurrency + 1; } - if (ctx->concur_list_len > GGML_MAX_NODES) { + if (ctx->concur_list_len > GGML_MAX_CONCUR) { fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -453,7 +474,7 @@ void ggml_metal_graph_compute( // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES; + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; From 3d9a55181603e85a26378a850a14068034e5002d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 7 Aug 2023 10:09:40 +0200 Subject: [PATCH 4/4] Fixed mmap prefetch for GPU offloading (#2529) --- llama-util.h | 2 +- llama.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-util.h b/llama-util.h index 3fc03ce28..6e9e39ddb 100644 --- a/llama-util.h +++ b/llama-util.h @@ -219,7 +219,7 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } + if (prefetch >= file->size) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { diff --git a/llama.cpp b/llama.cpp index 839739870..39aefd499 100644 --- a/llama.cpp +++ b/llama.cpp @@ -747,12 +747,12 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; - size_t prefetch_size = 0; + size_t prefetch_size = file_loader->file.size; size_t lock_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; - if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { - prefetch_size += lt.size; + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + prefetch_size -= lt.size; } }