diff --git a/Makefile b/Makefile index e0528aeee..32598edfe 100644 --- a/Makefile +++ b/Makefile @@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) + CFLAGS += -mcpu=native + CXXFLAGS += -mcpu=native +endif + +ifneq ($(filter armv6%,$(UNAME_M)),) + # Raspberry Pi 1, Zero + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access +endif + +ifneq ($(filter armv7%,$(UNAME_M)),) + # Raspberry Pi 2 + CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations +endif + +ifneq ($(filter armv8%,$(UNAME_M)),) + # Raspberry Pi 3, 4, Zero 2 (32-bit) + CFLAGS += -mfp16-format=ieee -mno-unaligned-access +endif + ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) @@ -270,28 +292,6 @@ ifdef LLAMA_METAL OBJS += ggml-metal.o endif # LLAMA_METAL -ifneq ($(filter aarch64%,$(UNAME_M)),) - # Apple M1, M2, etc. - # Raspberry Pi 3, 4, Zero 2 (64-bit) - CFLAGS += -mcpu=native - CXXFLAGS += -mcpu=native -endif - -ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, Zero - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -endif - -ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 2 - CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations -endif - -ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 3, 4, Zero 2 (32-bit) - CFLAGS += -mfp16-format=ieee -mno-unaligned-access -endif - ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ @@ -380,7 +380,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml. save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) diff --git a/build.zig b/build.zig index 2287d2a2c..04c88d8a2 100644 --- a/build.zig +++ b/build.zig @@ -1,68 +1,87 @@ +// Compatible with Zig Version 0.11.0 const std = @import("std"); -const commit_hash = @embedFile(".git/refs/heads/master"); +const Compile = std.Build.Step.Compile; +const ConfigHeader = std.Build.Step.ConfigHeader; +const Mode = std.builtin.Mode; +const CrossTarget = std.zig.CrossTarget; + +const Maker = struct { + builder: *std.build.Builder, + target: CrossTarget, + optimize: Mode, + config_header: *ConfigHeader, + + const cflags = .{"-std=c11"}; + const cxxflags = .{"-std=c++11"}; + + fn init(builder: *std.build.Builder) Maker { + const commit_hash = @embedFile(".git/refs/heads/master"); + const config_header = builder.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline + }, + ); + return Maker{ + .builder = builder, + .target = builder.standardTargetOptions(.{}), + .optimize = builder.standardOptimizeOption(.{}), + .config_header = config_header, + }; + } + + fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { + const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (std.mem.endsWith(u8, src, ".c")) { + o.addCSourceFiles(&.{src}, &cflags); + o.linkLibC(); + } else { + o.addCSourceFiles(&.{src}, &cxxflags); + o.linkLibCpp(); + } + o.addIncludePath(.{ .path = "." }); + o.addIncludePath(.{ .path = "./examples" }); + return o; + } + + fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { + const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); + e.addIncludePath(.{ .path = "." }); + e.addIncludePath(.{ .path = "./examples" }); + e.addCSourceFiles(&.{src}, &cxxflags); + for (deps) |d| e.addObject(d); + e.linkLibC(); + e.linkLibCpp(); + e.addConfigHeader(m.config_header); + m.builder.installArtifact(e); + + // Currently a bug is preventing correct linking for optimized builds for Windows: + // https://github.com/ziglang/zig/issues/15958 + if (e.target.isWindows()) { + e.want_lto = false; + } + return e; + } +}; -// Zig Version: 0.11.0-dev.3986+e05c242cd pub fn build(b: *std.build.Builder) void { - const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); + const make = Maker.init(b); - const config_header = b.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline - }, - ); + const ggml = make.obj("ggml", "ggml.c"); + const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const llama = make.obj("llama", "llama.cpp"); + const common = make.obj("common", "examples/common.cpp"); + const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp"); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); - lib.linkLibC(); - lib.linkLibCpp(); - lib.addIncludePath("."); - lib.addIncludePath("./examples"); - lib.addConfigHeader(config_header); - lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); - lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); - b.installArtifact(lib); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama }); - const examples = .{ - "main", - "baby-llama", - "embedding", - "metal", - "perplexity", - "quantize", - "quantize-stats", - "save-load-state", - "server", - "simple", - "train-text-from-scratch", - }; - - inline for (examples) |example_name| { - const exe = b.addExecutable(.{ - .name = example_name, - .target = target, - .optimize = optimize, - }); - exe.addIncludePath("."); - exe.addIncludePath("./examples"); - exe.addConfigHeader(config_header); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - b.installArtifact(exe); - - const run_cmd = b.addRunArtifact(exe); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| run_cmd.addArgs(args); - - const run_step = b.step("run-" ++ example_name, "Run the app"); - run_step.dependOn(&run_cmd.step); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + if (server.target.isWindows()) { + server.linkSystemLibrary("ws2_32"); } } diff --git a/convert.py b/convert.py index ab6a4e10e..f3bf17980 100644 --- a/convert.py +++ b/convert.py @@ -465,6 +465,13 @@ class GGMLQuantizedTensor(Tensor): def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor': return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type) + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + + def part(self, n_part: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] diff --git a/examples/common.cpp b/examples/common.cpp index 46ceef966..1789b8a95 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -203,6 +203,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = 1.0f/std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -575,8 +581,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); + fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); + fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); diff --git a/examples/console.cpp b/examples/console.cpp index 4c32f3b09..8966b107f 100644 --- a/examples/console.cpp +++ b/examples/console.cpp @@ -80,8 +80,10 @@ namespace console { // Set console input codepage to UTF16 _setmode(_fileno(stdin), _O_WTEXT); - if (!simple_io) { - // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + if (simple_io) { + dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; + } else { dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); } if (!SetConsoleMode(hConIn, dwMode)) { diff --git a/examples/llama.vim b/examples/llama.vim new file mode 100644 index 000000000..f03fadfb7 --- /dev/null +++ b/examples/llama.vim @@ -0,0 +1,132 @@ +" Requires an already running llama.cpp server +" To install either copy or symlink to ~/.vim/autoload/llama.vim +" Then start with either :call llama#doLlamaGen(), +" or add a keybind to your vimrc such as +" nnoremap Z :call llama#doLlamaGen() +" Similarly, you could add an insert mode keybind with +" inoremap call llama#doLlamaGen() +" +" g:llama_api_url and g:llama_overrides can be configured in your .vimrc +" let g:llama_api_url = "192.168.1.10:8080" +" llama_overrides can also be set through buffer/window scopes. For instance +" autocmd filetype python let b:llama_overrides = {"temp": 0.2} +" Could be added to your .vimrc to automatically set a lower temperature when +" editing a python script +" Additionally, an override dict can be stored at the top of a file +" !*{"stop": ["User:"]} +" Could be added to the start of your chatlog.txt to set the stopping token +" These parameter dicts are merged together from lowest to highest priority: +" server default -> g:llama_overrides -> w:llama_overrides -> +" b:llama_overrides -> in file (!*) overrides +" +" Sublists (like logit_bias and stop) are overridden, not merged +" Example override: +" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} +if !exists("g:llama_api_url") + let g:llama_api_url= "127.0.0.1:8080" +endif +if !exists("g:llama_overrides") + let g:llama_overrides = {} +endif +const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } +const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"] +let s:linedict = {} + +func s:callbackHandler(bufn, channel, msg) + if len(a:msg) < 3 + return + elseif a:msg[0] == "d" + let l:msg = a:msg[6:-1] + else + let l:msg = a:msg + endif + let l:decoded_msg = json_decode(l:msg) + let l:newtext = split(l:decoded_msg['content'], "\n", 1) + if len(l:newtext) > 0 + call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0]) + else + echo "nothing genned" + endif + if len(newtext) > 1 + let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1]) + let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1 + endif + if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop + echo "Finished generation" + endif +endfunction + +func llama#doLlamaGen() + if exists("b:job") + if job_status(b:job) == "run" + call job_stop(b:job) + return + endif + endif + + let l:cbuffer = bufnr("%") + let s:linedict[l:cbuffer] = line('$') + let l:buflines = getbufline(l:cbuffer, 1, 1000) + let l:querydata = copy(s:querydata) + call extend(l:querydata, g:llama_overrides) + if exists("w:llama_overrides") + call extend(l:querydata, w:llama_overrides) + endif + if exists("b:llama_overrides") + call extend(l:querydata, b:llama_overrides) + endif + if l:buflines[0][0:1] == '!*' + let l:userdata = json_decode(l:buflines[0][2:-1]) + call extend(l:querydata, l:userdata) + let l:buflines = l:buflines[1:-1] + endif + let l:querydata.prompt = join(l:buflines, "\n") + let l:curlcommand = copy(s:curlcommand) + let l:curlcommand[2] = json_encode(l:querydata) + let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) +endfunction + +" Echos the tokkenization of the provided string , or cursor to end of word +" Onus is placed on the user to include the preceding space +func llama#tokenizeWord(...) + if (a:0 > 0) + let l:input = a:1 + else + exe "normal \"*ye" + let l:input = @* + endif + let l:querydata = {"content": l:input} + let l:curlcommand = copy(s:curlcommand) + let l:curlcommand[2] = json_encode(l:querydata) + let l:curlcommand[8] = g:llama_api_url .. "/tokenize" + let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])}) +endfunction + +func s:tokenizeWordCallback(plaintext, channel, msg) + echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens) +endfunction + + +" Echos the token count of the entire buffer (or provided string) +" Example usage :echo llama#tokenCount() +func llama#tokenCount(...) + if (a:0 > 0) + let l:buflines = a:1 + else + let l:buflines = getline(1,1000) + if l:buflines[0][0:1] == '!*' + let l:buflines = l:buflines[1:-1] + endif + let l:buflines = join(l:buflines, "\n") + endif + let l:querydata = {"content": l:buflines} + let l:curlcommand = copy(s:curlcommand) + let l:curlcommand[2] = json_encode(l:querydata) + let l:curlcommand[8] = g:llama_api_url .. "/tokenize" + let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"}) +endfunction + +func s:tokenCountCallback(channel, msg) + let resp = json_decode(a:msg) + echo len(resp.tokens) +endfunction diff --git a/examples/llm.vim b/examples/llm.vim index efecad0cd..594a28549 100644 --- a/examples/llm.vim +++ b/examples/llm.vim @@ -1,3 +1,5 @@ +" Basic plugin example + function! Llm() let url = "http://127.0.0.1:8080/completion" @@ -16,8 +18,10 @@ function! Llm() " Extract the content field from the response let content = json_decode(response).content + let split_newlines = split(content, '\n', 1) + " Insert the content at the cursor position - call setline(line('.'), getline('.') . content) + call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:]) endfunction command! Llm call Llm() diff --git a/examples/main/README.md b/examples/main/README.md index 014112e5d..55c16096f 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results. +### Extended Context Size + +Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. + +- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. + ### Keep Prompt The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. diff --git a/examples/server/README.md b/examples/server/README.md index aee31ae42..e56ca063a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -151,6 +151,8 @@ node . `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). + `grammar`: Set grammar for grammar-based sampling (default: no grammar) + `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed). `ignore_eos`: Ignore end of stream token and continue generating (default: false). diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 10b666c1c..fe91b2343 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "llama.h" #include "build-info.h" +#include "grammar-parser.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -195,6 +196,8 @@ struct llama_server_context llama_context *ctx = nullptr; gpt_params params; + llama_grammar *grammar = nullptr; + bool truncated = false; bool stopped_eos = false; bool stopped_word = false; @@ -226,6 +229,7 @@ struct llama_server_context void rewind() { params.antiprompt.clear(); + params.grammar.clear(); num_prompt_tokens = 0; num_tokens_predicted = 0; generated_text = ""; @@ -237,6 +241,7 @@ struct llama_server_context stopped_limit = false; stopping_word = ""; multibyte_pending = 0; + grammar = nullptr; n_remain = 0; n_past = 0; @@ -257,6 +262,33 @@ struct llama_server_context return true; } + bool loadGrammar() + { + if (!params.grammar.empty()) { + grammar_parser::parse_state parsed_grammar; + + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + LOG_ERROR("grammar parse error", {{"grammar", params.grammar}}); + return false; + } + grammar_parser::print_grammar(stderr, parsed_grammar); + + { + auto it = params.logit_bias.find(llama_token_eos()); + if (it != params.logit_bias.end() && it->second == -INFINITY) { + LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); + } + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + return true; + } + void loadPrompt() { params.prompt.insert(0, 1, ' '); // always add a first space @@ -420,6 +452,10 @@ struct llama_server_context logits[llama_token_nl()] = nl_logit; } + if (grammar != nullptr) { + llama_sample_grammar(ctx, &candidates_p, grammar); + } + if (temp <= 0) { // Greedy sampling @@ -457,10 +493,15 @@ struct llama_server_context } } + if (grammar != nullptr) { + llama_grammar_accept_token(ctx, grammar, result.tok); + } + for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i) { result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p}); } + last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(result.tok); num_tokens_predicted++; @@ -947,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama) {"stream", llama.stream}, {"logit_bias", llama.params.logit_bias}, {"n_probs", llama.params.n_probs}, + {"grammar", llama.params.grammar}, }; } @@ -1048,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla llama.params.n_keep = body.value("n_keep", default_params.n_keep); llama.params.seed = body.value("seed", default_params.seed); llama.params.prompt = body.value("prompt", default_params.prompt); + llama.params.grammar = body.value("grammar", default_params.grammar); llama.params.n_probs = body.value("n_probs", default_params.n_probs); llama.params.logit_bias.clear(); @@ -1179,6 +1222,12 @@ int main(int argc, char **argv) parse_options_completion(json::parse(req.body), llama); + if (!llama.loadGrammar()) + { + res.status = 400; + return; + } + llama.loadPrompt(); llama.beginCompletion(); @@ -1334,8 +1383,12 @@ int main(int argc, char **argv) svr.set_error_handler([](const Request &, Response &res) { - res.set_content("File Not Found", "text/plain"); - res.status = 404; }); + if (res.status == 400) { + res.set_content("Invalid request", "text/plain"); + } else { + res.set_content("File Not Found", "text/plain"); + res.status = 404; + } }); // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); @@ -1363,6 +1416,9 @@ int main(int argc, char **argv) return 1; } + if (llama.grammar != nullptr) { + llama_grammar_free(llama.grammar); + } llama_backend_free(); return 0; diff --git a/ggml-metal.m b/ggml-metal.m index 3f098d396..b47a98e21 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -7,6 +7,11 @@ #import #import +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + #ifdef GGML_METAL_NDEBUG #define metal_printf(...) #else @@ -15,6 +20,8 @@ #define UNUSED(x) (void)(x) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + struct ggml_metal_buffer { const char * name; @@ -36,7 +43,7 @@ struct ggml_metal_context { int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; - int concur_list[GGML_MAX_NODES]; + int concur_list[GGML_MAX_CONCUR]; int concur_list_len; // custom kernels @@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[GGML_MAX_NODES]; + int nodes_unused[GGML_MAX_CONCUR]; - for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;} - for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;} + for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } + for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } ctx->concur_list_len = 0; - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos + int n_left = gf->n_nodes; + int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list + int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos while (n_left > 0) { // number of nodes at a layer (that can be issued concurrently) @@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency( for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { if (nodes_unused[i]) { // if the requirements for gf->nodes[i] are satisfied - int exe_flag=1; + int exe_flag = 1; + // scan all srcs for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; if (src_cur) { // if is leaf nodes it's satisfied. - if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;} + // TODO: ggml_is_leaf() + if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { + continue; + } // otherwise this src should be the output from previous nodes. int is_found = 0; + // scan 2*search_depth back because we inserted barrier. - for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;} + //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { + for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { + if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { + is_found = 1; + break; + } + } + if (is_found == 0) { + exe_flag = 0; + break; } - if (is_found == 0) {exe_flag = 0; break;} } } if (exe_flag) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); + int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); for (int j = n_start; j < i; j++) { if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ && gf->nodes[j]->op != GGML_OP_VIEW \ @@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency( if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { continue; - } else { - exe_flag = 0; } + + exe_flag = 0; } } } @@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency( ctx->concur_list[level_pos + concurrency] = -1; ctx->concur_list_len++; // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) {n_start++;} + while (!nodes_unused[n_start]) { + n_start++; + } level_pos += concurrency + 1; } - if (ctx->concur_list_len > GGML_MAX_NODES) { + if (ctx->concur_list_len > GGML_MAX_CONCUR) { fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); } } @@ -453,7 +474,7 @@ void ggml_metal_graph_compute( // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES; + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; diff --git a/ggml.c b/ggml.c index fa0f98aa0..beb7f4641 100644 --- a/ggml.c +++ b/ggml.c @@ -195,8 +195,8 @@ typedef void * thread_ret_t; #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; +inline static void * ggml_aligned_malloc(size_t size) { + void * aligned_memory = NULL; #ifdef GGML_USE_METAL int result = posix_memalign(&aligned_memory, getpagesize(), size); #else @@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { // // is enough, but just in case, adding the second part - return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); + return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { @@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, - /*.op_params =*/ {0}, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings assert(params_size <= GGML_MAX_OP_PARAMS); memcpy(tensor->op_params, params, params_size); } @@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute( result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); return result; } @@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[6] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, n_dims, mode, n_ctx }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi( int32_t op_params[3] = { n_past, n_head }; memcpy(op_params + 2, &bias_max, sizeof(float)); - ggml_set_op_params(result, &op_params, sizeof(op_params)); + ggml_set_op_params(result, op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), a->ne[2], 1, 1, }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( // ggml_conv_2d -struct ggml_tensor* ggml_conv_2d( - struct ggml_context* ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d( ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), a->ne[3], b->ne[3], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d( // ggml_conv_1d_ph -struct ggml_tensor* ggml_conv_1d_ph( +struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { // ggml_pool_1d -struct ggml_tensor* ggml_pool_1d( +struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d( ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d( // ggml_pool_2d -struct ggml_tensor* ggml_pool_2d( +struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d( ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; - ggml_set_op_params(result, ¶ms, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom1 +// ggml_map_custom1_f32 static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, @@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32( return ggml_map_custom1_impl_f32(ctx, a, fun, true); } -// ggml_map_custom2 +// ggml_map_custom2_f32 static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_context * ctx, @@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32( return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom3 +// ggml_map_custom3_f32 static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_context * ctx, @@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); } +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + + + // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32( for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; @@ -10546,72 +10731,96 @@ static void ggml_compute_forward_mul_mat( return; } - // parallelize by src0 rows - const int64_t dr = (ne01 + nth - 1)/nth; - - const int64_t ir10 = dr*ith; - const int64_t ir11 = MIN(ir10 + dr, ne01); - - // src1 rows - const int64_t nr1 = ne11*ne12*ne13; - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { - const int64_t i13 = (ir1/(ne12*ne11)); - const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; - const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows - const int64_t ir0 = (ir1/ne11)%(ne02*ne03); - const int64_t i03 = (ir0/(ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. - // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: - // GG: this is likely the correct way to broadcast, though need some more thought - // therefore leaving the comments to remind us for now - const int64_t i02 = (i12 / (ne12 / ne02)); - // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) - // const int64_t i02 = (ir0 - i03*ne02); + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + // distribute the thread work across the inner or outer loop based on which one is larger - const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; - for (int64_t ir = ir10; ir < ir11; ++ir) { - vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); - } + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; } - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } } - // ggml_compute_forward_out_prod - static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d( const struct ggml_tensor * src0, struct ggml_tensor * dst) { - const int32_t* opts = (const int32_t*)dst->op_params; + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32( fun(dst, a); } - -static void ggml_compute_forward_map_custom1( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - struct ggml_tensor * dst, - const ggml_custom1_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom1_f32(params, a, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom2 static void ggml_compute_forward_map_custom2_f32( @@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32( } -static void ggml_compute_forward_map_custom2( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - struct ggml_tensor * dst, - const ggml_custom2_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32( fun(dst, a, b, c); } +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, - struct ggml_tensor * dst, - const ggml_custom3_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); } // ggml_compute_forward_cross_entropy_loss @@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: + { + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3_F32: + { + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); } break; case GGML_OP_MAP_CUSTOM2: { - ggml_custom2_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MAP_CUSTOM3: { - ggml_custom3_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: case GGML_OP_MAP_CUSTOM1: case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: @@ -16449,12 +16668,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: { n_tasks = 1; } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_CROSS_ENTROPY_LOSS: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index aba92480c..bdbd12800 100644 --- a/ggml.h +++ b/ggml.h @@ -183,6 +183,15 @@ # define GGML_API #endif +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + #include #include #include @@ -374,6 +383,10 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + GGML_OP_MAP_CUSTOM1, GGML_OP_MAP_CUSTOM2, GGML_OP_MAP_CUSTOM3, @@ -570,6 +583,8 @@ extern "C" { GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -1240,7 +1255,7 @@ extern "C" { // conv_1d with padding = half // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor* ggml_conv_1d_ph( + GGML_API struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1253,7 +1268,7 @@ extern "C" { GGML_OP_POOL_COUNT, }; - GGML_API struct ggml_tensor* ggml_pool_1d( + GGML_API struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1261,7 +1276,7 @@ extern "C" { int s0, // stride int p0); // padding - GGML_API struct ggml_tensor* ggml_pool_2d( + GGML_API struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1315,15 +1330,6 @@ extern "C" { int h0, int w); - // custom operators - - typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - GGML_API struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1334,63 +1340,138 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); - GGML_API struct ggml_tensor * ggml_map_unary_f32( + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_binary_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); // loss function diff --git a/llama-util.h b/llama-util.h index 3fc03ce28..6e9e39ddb 100644 --- a/llama-util.h +++ b/llama-util.h @@ -219,7 +219,7 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } + if (prefetch >= file->size) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { diff --git a/llama.cpp b/llama.cpp index 01c904a34..f58ca7487 100644 --- a/llama.cpp +++ b/llama.cpp @@ -149,7 +149,7 @@ static const std::map & MEM_REQ_EVAL() } // amount of VRAM needed per batch size to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_BASE() { static std::map k_sizes = { @@ -157,14 +157,14 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() { MODEL_7B, 512ull * kB }, { MODEL_13B, 640ull * kB }, { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1536ull * kB }, - { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) + { MODEL_65B, 1280ull * kB }, + { MODEL_70B, 1280ull * kB }, }; return k_sizes; } // amount of VRAM needed per batch size and context to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { static std::map k_sizes = { @@ -172,8 +172,8 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { MODEL_7B, 128ull }, { MODEL_13B, 160ull }, { MODEL_30B, 208ull }, - { MODEL_65B, 416ull }, - { MODEL_70B, 416ull }, // TODO (likely can be reduced) + { MODEL_65B, 256ull }, + { MODEL_70B, 256ull }, }; return k_sizes; } @@ -747,12 +747,12 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; - size_t prefetch_size = 0; + size_t prefetch_size = file_loader->file.size; size_t lock_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; - if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { - prefetch_size += lt.size; + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + prefetch_size -= lt.size; } }