Merge branch 'ggerganov:master' into master

commit 5624a29c1f by Eve, 2023-08-08 21:13:28 -04:00 (committed by GitHub)
15 changed files with 882 additions and 299 deletions

Makefile

@@ -142,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#CXXFLAGS += -mssse3
 endif
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	# Apple M1, M2, etc.
+	# Raspberry Pi 3, 4, Zero 2 (64-bit)
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, Zero
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 2
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 3, 4, Zero 2 (32-bit)
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -270,28 +292,6 @@ ifdef LLAMA_METAL
 	OBJS += ggml-metal.o
 endif # LLAMA_METAL
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	CFLAGS += -mcpu=native
-	CXXFLAGS += -mcpu=native
-endif
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -380,7 +380,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)

build.zig

@@ -1,68 +1,87 @@
+// Compatible with Zig Version 0.11.0
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;
+
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    config_header: *ConfigHeader,
+
+    const cflags = .{"-std=c11"};
+    const cxxflags = .{"-std=c++11"};
+
+    fn init(builder: *std.build.Builder) Maker {
+        const commit_hash = @embedFile(".git/refs/heads/master");
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+            },
+        );
+        return Maker{
+            .builder = builder,
+            .target = builder.standardTargetOptions(.{}),
+            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
+        };
+    }
+
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, &cflags);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.linkLibCpp();
+        }
+        o.addIncludePath(.{ .path = "." });
+        o.addIncludePath(.{ .path = "./examples" });
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addIncludePath(.{ .path = "." });
+        e.addIncludePath(.{ .path = "./examples" });
+        e.addCSourceFiles(&.{src}, &cxxflags);
+        for (deps) |d| e.addObject(d);
+        e.linkLibC();
+        e.linkLibCpp();
+        e.addConfigHeader(m.config_header);
+        m.builder.installArtifact(e);
+
+        // Currently a bug is preventing correct linking for optimized builds for Windows:
+        // https://github.com/ziglang/zig/issues/15958
+        if (e.target.isWindows()) {
+            e.want_lto = false;
+        }
+        return e;
+    }
+};
+
-// Zig Version: 0.11.0-dev.3986+e05c242cd
 pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.linkLibC();
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
-    b.installArtifact(lib);
-    const examples = .{
-        "main",
-        "baby-llama",
-        "embedding",
-        "metal",
-        "perplexity",
-        "quantize",
-        "quantize-stats",
-        "save-load-state",
-        "server",
-        "simple",
-        "train-text-from-scratch",
-    };
-    inline for (examples) |example_name| {
-        const exe = b.addExecutable(.{
-            .name = example_name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(".");
-        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
-            "examples/common.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run-" ++ example_name, "Run the app");
-        run_step.dependOn(&run_cmd.step);
-    }
+    const make = Maker.init(b);
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const llama = make.obj("llama", "llama.cpp");
+    const common = make.obj("common", "examples/common.cpp");
+    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
+    }
 }

convert.py

@@ -465,6 +465,13 @@ class GGMLQuantizedTensor(Tensor):
     def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
         return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)

+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
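(Aside: the divisor 3 here presumably reflects a fused attention tensor whose leading dimension stacks three equal blocks, e.g. query/key/value. With r = shape[0] // 3, part(n) slices rows [r*n, r*(n+1)) out of the (3r, ...) array, and permute_part additionally applies the head permutation to that same slice.)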

examples/common.cpp

@@ -203,6 +203,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -575,8 +581,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
     fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");

@@ -80,8 +80,10 @@ namespace console {
         // Set console input codepage to UTF16
         _setmode(_fileno(stdin), _O_WTEXT);

-        if (!simple_io) {
-            // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        if (simple_io) {
+            dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
+        } else {
             dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
         }
         if (!SetConsoleMode(hConIn, dwMode)) {

examples/llama.vim (new file)

@@ -0,0 +1,132 @@
" Requires an already running llama.cpp server
" To install either copy or symlink to ~/.vim/autoload/llama.vim
" Then start with either :call llama#doLlamaGen(),
" or add a keybind to your vimrc such as
" nnoremap Z :call llama#doLlamaGen()<CR>
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
" Could be added to your .vimrc to automatically set a lower temperature when
" editing a python script
" Additionally, an override dict can be stored at the top of a file
" !*{"stop": ["User:"]}
" Could be added to the start of your chatlog.txt to set the stopping token
" These parameter dicts are merged together from lowest to highest priority:
" server default -> g:llama_overrides -> w:llama_overrides ->
" b:llama_overrides -> in file (!*) overrides
"
" Sublists (like logit_bias and stop) are overridden, not merged
" Example override:
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
if !exists("g:llama_api_url")
let g:llama_api_url= "127.0.0.1:8080"
endif
if !exists("g:llama_overrides")
let g:llama_overrides = {}
endif
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
let s:linedict = {}
func s:callbackHandler(bufn, channel, msg)
if len(a:msg) < 3
return
elseif a:msg[0] == "d"
let l:msg = a:msg[6:-1]
else
let l:msg = a:msg
endif
let l:decoded_msg = json_decode(l:msg)
let l:newtext = split(l:decoded_msg['content'], "\n", 1)
if len(l:newtext) > 0
call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
else
echo "nothing genned"
endif
if len(newtext) > 1
let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
endif
if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
echo "Finished generation"
endif
endfunction
func llama#doLlamaGen()
if exists("b:job")
if job_status(b:job) == "run"
call job_stop(b:job)
return
endif
endif
let l:cbuffer = bufnr("%")
let s:linedict[l:cbuffer] = line('$')
let l:buflines = getbufline(l:cbuffer, 1, 1000)
let l:querydata = copy(s:querydata)
call extend(l:querydata, g:llama_overrides)
if exists("w:llama_overrides")
call extend(l:querydata, w:llama_overrides)
endif
if exists("b:llama_overrides")
call extend(l:querydata, b:llama_overrides)
endif
if l:buflines[0][0:1] == '!*'
let l:userdata = json_decode(l:buflines[0][2:-1])
call extend(l:querydata, l:userdata)
let l:buflines = l:buflines[1:-1]
endif
let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction
" Echos the tokkenization of the provided string , or cursor to end of word
" Onus is placed on the user to include the preceding space
func llama#tokenizeWord(...)
if (a:0 > 0)
let l:input = a:1
else
exe "normal \"*ye"
let l:input = @*
endif
let l:querydata = {"content": l:input}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
endfunction
func s:tokenizeWordCallback(plaintext, channel, msg)
echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
endfunction
" Echos the token count of the entire buffer (or provided string)
" Example usage :echo llama#tokenCount()
func llama#tokenCount(...)
if (a:0 > 0)
let l:buflines = a:1
else
let l:buflines = getline(1,1000)
if l:buflines[0][0:1] == '!*'
let l:buflines = l:buflines[1:-1]
endif
let l:buflines = join(l:buflines, "\n")
endif
let l:querydata = {"content": l:buflines}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
endfunction
func s:tokenCountCallback(channel, msg)
let resp = json_decode(a:msg)
echo len(resp.tokens)
endfunction

examples/llm.vim

@@ -1,3 +1,5 @@
+" Basic plugin example
+
 function! Llm()

   let url = "http://127.0.0.1:8080/completion"
@@ -16,8 +18,10 @@ function! Llm()
   " Extract the content field from the response
   let content = json_decode(response).content

+  let split_newlines = split(content, '\n', 1)
+
   " Insert the content at the cursor position
-  call setline(line('.'), getline('.') . content)
+  call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])

 endfunction

 command! Llm call Llm()

examples/main/README.md

@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models extend the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
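For instance, a base model pre-trained with a 4096 context and fine-tuned to 32k gives N = 32768 / 4096 = 8, i.e. `--ctx-size 32768 --rope-scale 8`; as the `--rope-scale` handling in `common.cpp` above shows, this is stored internally as `rope_freq_scale = 1/8 = 0.125`.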
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.

examples/server/README.md

@@ -151,6 +151,8 @@ node .

     `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
     `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

     `ignore_eos`: Ignore end of stream token and continue generating (default: false).
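As an illustration (the prompt and grammar strings here are made up), a request body like `{"prompt": "Is the sky blue? ", "grammar": "root ::= \"yes\" | \"no\""}` constrains the completion to one of those two words; the grammar string is in the GBNF format that `grammar-parser` parses in `server.cpp` below, and a `root` rule is required (`symbol_ids.at("root")` there assumes one).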

examples/server/server.cpp

@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -195,6 +196,8 @@ struct llama_server_context
     llama_context *ctx = nullptr;
     gpt_params params;

+    llama_grammar *grammar = nullptr;
+
     bool truncated = false;
     bool stopped_eos = false;
     bool stopped_word = false;
@@ -226,6 +229,7 @@ struct llama_server_context
     void rewind()
     {
         params.antiprompt.clear();
+        params.grammar.clear();
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
@@ -237,6 +241,7 @@ struct llama_server_context
         stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
+        grammar = nullptr;

         n_remain = 0;
         n_past = 0;
@@ -257,6 +262,33 @@ struct llama_server_context
         return true;
     }

+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            grammar_parser::parse_state parsed_grammar;
+
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos());
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
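A minimal sketch of the same grammar flow outside the server, using only calls that appear in this diff (grammar_parser::parse, llama_grammar_init, llama_sample_grammar, llama_grammar_accept_token, llama_grammar_free); the grammar string is a made-up example:

    // sketch: constrain sampling to "yes" or "no"
    grammar_parser::parse_state parsed_grammar = grammar_parser::parse("root ::= \"yes\" | \"no\"");
    if (!parsed_grammar.rules.empty()) { // empty rules mean a parse error
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        llama_grammar * grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
        // per token: llama_sample_grammar(ctx, &candidates_p, grammar);
        // after a token is sampled: llama_grammar_accept_token(ctx, grammar, tok);
        llama_grammar_free(grammar); // mirrors the cleanup at the end of main() below
    }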
     void loadPrompt()
     {
         params.prompt.insert(0, 1, ' '); // always add a first space
@@ -420,6 +452,10 @@ struct llama_server_context
             logits[llama_token_nl()] = nl_logit;
         }

+        if (grammar != nullptr) {
+            llama_sample_grammar(ctx, &candidates_p, grammar);
+        }
+
         if (temp <= 0)
         {
             // Greedy sampling
@@ -457,10 +493,15 @@ struct llama_server_context
             }
         }

+        if (grammar != nullptr) {
+            llama_grammar_accept_token(ctx, grammar, result.tok);
+        }
+
         for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
         {
             result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
         }
+
         last_n_tokens.erase(last_n_tokens.begin());
         last_n_tokens.push_back(result.tok);
         num_tokens_predicted++;
@@ -947,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama)
         {"stream", llama.stream},
         {"logit_bias", llama.params.logit_bias},
         {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
     };
 }
@@ -1048,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.prompt = body.value("prompt", default_params.prompt);
+    llama.params.grammar = body.value("grammar", default_params.grammar);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);

     llama.params.logit_bias.clear();
@@ -1179,6 +1222,12 @@ int main(int argc, char **argv)

             parse_options_completion(json::parse(req.body), llama);

+            if (!llama.loadGrammar())
+            {
+                res.status = 400;
+                return;
+            }
+
             llama.loadPrompt();
             llama.beginCompletion();
@@ -1334,8 +1383,12 @@ int main(int argc, char **argv)

     svr.set_error_handler([](const Request &, Response &res)
                           {
-                              res.set_content("File Not Found", "text/plain");
-                              res.status = 404; });
+                              if (res.status == 400) {
+                                  res.set_content("Invalid request", "text/plain");
+                              } else {
+                                  res.set_content("File Not Found", "text/plain");
+                                  res.status = 404;
+                              } });

     // set timeouts and change hostname and port
     svr.set_read_timeout(sparams.read_timeout);
@@ -1363,6 +1416,9 @@ int main(int argc, char **argv)
         return 1;
     }

+    if (llama.grammar != nullptr) {
+        llama_grammar_free(llama.grammar);
+    }
     llama_backend_free();

     return 0;

ggml-metal.m

@@ -7,6 +7,11 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
 #else
@@ -15,6 +20,8 @@
 #define UNUSED(x) (void)(x)

+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
 struct ggml_metal_buffer {
     const char * name;
@@ -36,7 +43,7 @@ struct ggml_metal_context {
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

-    int concur_list[GGML_MAX_NODES];
+    int concur_list[GGML_MAX_CONCUR];
     int concur_list_len;

     // custom kernels
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_NODES];
+    int nodes_unused[GGML_MAX_CONCUR];

-    for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
-    for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
     ctx->concur_list_len = 0;

     int n_left = gf->n_nodes;
     int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
     int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos

     while (n_left > 0) {
         // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
         for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
             if (nodes_unused[i]) {
                 // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag=1;
+                int exe_flag = 1;

                 // scan all srcs
                 for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                     struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                     if (src_cur) {
                         // if is leaf nodes it's satisfied.
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }

                         // otherwise this src should be the output from previous nodes.
                         int is_found = 0;

                         // scan 2*search_depth back because we inserted barrier.
-                        for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                            if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
                         }
-                        if (is_found == 0) {exe_flag = 0; break;}
                     }
                 }

                 if (exe_flag) {
                     // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                     // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                     int64_t data_start = (int64_t) gf->nodes[i]->data;
                     int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
                     for (int j = n_start; j < i; j++) {
                         if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                                             && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
                             if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                                 ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                                 continue;
-                            } else {
-                                exe_flag = 0;
                             }
+
+                            exe_flag = 0;
                         }
                     }
                 }
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
         ctx->concur_list[level_pos + concurrency] = -1;
         ctx->concur_list_len++;
         // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {n_start++;}
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
         level_pos += concurrency + 1;
     }

-    if (ctx->concur_list_len > GGML_MAX_NODES) {
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
         fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
     }
 }
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
     // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;

-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;

     const int n_nodes  = has_concur ? ctx->concur_list_len : gf->n_nodes;
     edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
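Why the new bound is 2*GGML_MAX_NODES: every level appended to concur_list is closed with a -1 barrier entry (ctx->concur_list[level_pos + concurrency] = -1 above, also counted by concur_list_len), so in the degenerate case of one node per level the list holds two entries per graph node. A toy illustration of the resulting layout, not part of the diff:

    #include <cstdio>
    int main() {
        // two levels of two nodes each, with a -1 barrier closing every level,
        // mirroring the layout ggml_metal_graph_find_concurrency produces
        const int concur_list[] = { 0, 2, -1, 1, 3, -1 };
        for (int v : concur_list) {
            if (v == -1) { std::printf("| "); } else { std::printf("n%d ", v); }
        }
        std::printf("\n"); // prints: n0 n2 | n1 n3 |
        return 0;
    }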

ggml.c

@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
@@ -3811,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3883,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4110,7 +4110,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
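GGML_PAD itself is not shown in this hunk; assuming the usual ggml.h definition (round up to the next multiple of a power-of-two alignment), a quick check of what the padded ggml_nbytes does to an odd size:

    #include <cstdio>
    // assumed to match ggml.h: round x up to the next multiple of n (n a power of two)
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
    int main() {
        std::printf("%d\n", GGML_PAD(100, 16)); // 112: 100 bytes rounded up to a 16-byte boundary
        std::printf("%d\n", GGML_PAD(112, 16)); // 112: already aligned, unchanged
        return 0;
    }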
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -4253,7 +4253,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
@@ -4602,7 +4602,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
@@ -4634,6 +4634,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }

 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
@@ -6439,7 +6440,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;

     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     return result;
 }
@@ -6565,7 +6566,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6605,7 +6606,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6721,9 +6722,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6797,7 +6798,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6828,7 +6829,7 @@ struct ggml_tensor * ggml_alibi(

     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));

     result->op   = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6855,7 +6856,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);

     float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6890,10 +6891,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6905,10 +6906,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(

 // ggml_conv_2d

-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
     int s0,
     int s1,
     int p0,
@@ -6929,10 +6930,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6945,7 +6946,7 @@ struct ggml_tensor* ggml_conv_2d(

 // ggml_conv_1d_ph

-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
@@ -6963,7 +6964,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {

 // ggml_pool_1d

-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -6982,10 +6983,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6996,7 +6997,7 @@ struct ggml_tensor* ggml_pool_1d(

 // ggml_pool_2d

-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -7019,10 +7020,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7190,7 +7191,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7220,7 +7221,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

     int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7349,7 +7350,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }

-// ggml_map_custom1
+// ggml_map_custom1_f32

 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context * ctx,
@@ -7366,7 +7367,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(

     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

-    result->op = GGML_OP_MAP_CUSTOM1;
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
@@ -7387,7 +7388,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }

-// ggml_map_custom2
+// ggml_map_custom2_f32

 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context * ctx,
@@ -7405,7 +7406,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(

     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

-    result->op = GGML_OP_MAP_CUSTOM2;
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7429,7 +7430,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }

-// ggml_map_custom3
+// ggml_map_custom3_f32

 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context * ctx,
@@ -7448,7 +7449,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(

     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));

-    result->op = GGML_OP_MAP_CUSTOM3;
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7475,6 +7476,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }

+// ggml_map_custom1
+
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
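The new entry points take a callback, a thread count (GGML_N_TASKS_MAX to let ggml decide), and an opaque userdata pointer. A sketch of calling ggml_map_custom1; the callback signature (dst, a, ith, nth, userdata) is assumed from the ggml.h side of this change, and the graph-compute calls are the ggml API of the same era:

    #include "ggml.h"

    // scale every element of a by *(float *) userdata; ith/nth identify this thread's share
    static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                         int ith, int nth, void * userdata) {
        const float s = *(const float *) userdata;
        const int64_t n  = ggml_nelements(dst);
        const int64_t dr = (n + nth - 1)/nth; // elements per thread, as in the impls above
        const int64_t i0 = dr*ith;
        const int64_t i1 = i0 + dr < n ? i0 + dr : n;
        for (int64_t i = i0; i < i1; ++i) {
            ((float *) dst->data)[i] = ((const float *) a->data)[i] * s;
        }
    }

    int main() {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(ip);
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        for (int i = 0; i < 8; ++i) { ((float *) x->data)[i] = (float) i; }
        static float s = 2.0f;
        struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &s);
        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute_with_ctx(ctx, &gf, 4); // run on 4 threads
        ggml_free(ctx);
        return 0;
    }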
// ggml_cross_entropy_loss // ggml_cross_entropy_loss
struct ggml_tensor * ggml_cross_entropy_loss( struct ggml_tensor * ggml_cross_entropy_loss(
@ -9283,8 +9468,8 @@ static void ggml_compute_forward_sum_rows_f32(
for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i3 = 0; i3 < ne03; i3++) {
for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i2 = 0; i2 < ne02; i2++) {
for (int64_t i1 = 0; i1 < ne01; i1++) { for (int64_t i1 = 0; i1 < ne01; i1++) {
float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0; float row_sum = 0;
ggml_vec_sum_f32(ne00, &row_sum, src_row); ggml_vec_sum_f32(ne00, &row_sum, src_row);
dst_row[0] = row_sum; dst_row[0] = row_sum;
@ -10546,72 +10731,96 @@ static void ggml_compute_forward_mul_mat(
return; return;
} }
// parallelize by src0 rows
const int64_t dr = (ne01 + nth - 1)/nth;
const int64_t ir10 = dr*ith;
const int64_t ir11 = MIN(ir10 + dr, ne01);
// src1 rows
const int64_t nr1 = ne11*ne12*ne13;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
-
-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
-
-        const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-             : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
-
-        //int64_t t1 = ggml_time_us();
-        //static int64_t acc = 0;
-        //acc += t1 - t0;
-        //if (t1 - t0 > 10) {
-        //    printf("\n");
-        //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-        //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-        //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-        //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-        //}
-    }
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+    // distribute the thread work across the inner or outer loop based on which one is larger
+
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
+    }
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
+
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
 }
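The rewritten kernel above replaces the flat loop over src1 rows with a two-level scheme: the nth threads are arranged as an nth0 x nth1 grid along whichever of the two row counts is larger, and each axis is split by a ceiling division so every thread owns one contiguous range per axis. The following standalone sketch (not part of the diff; MIN as defined in ggml.c) reproduces just that partitioning logic:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Sketch of the 2D work split used above: thread `ith` of `nth` gets the
    // row ranges [r0[0], r0[1]) of src0 and [r1[0], r1[1]) of src1.
    static void split_2d(int64_t nr0, int64_t nr1, int ith, int nth,
                         int64_t r0[2], int64_t r1[2]) {
        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize the larger axis
        const int64_t nth1 = nr0 > nr1 ? 1 : nth;

        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;

        const int64_t dr0 = (nr0 + nth0 - 1)/nth0; // ceil(nr0/nth0)
        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

        r0[0] = dr0*ith0; r0[1] = MIN(r0[0] + dr0, nr0);
        r1[0] = dr1*ith1; r1[1] = MIN(r1[0] + dr1, nr1);
    }

    int main(void) {
        // e.g. 4096 src0 rows, 1 src1 row (a typical mat-vec), 8 threads:
        // every thread receives 512 consecutive src0 rows and all of src1.
        for (int ith = 0; ith < 8; ++ith) {
            int64_t r0[2], r1[2];
            split_2d(4096, 1, ith, 8, r0, r1);
            printf("thread %d: src0 [%lld, %lld), src1 [%lld, %lld)\n",
                   ith, (long long) r0[0], (long long) r0[1],
                   (long long) r1[0], (long long) r1[1]);
        }
        return 0;
    }

When nr1 dominates instead (large batches), the roles flip and the threads line up along the src1 axis; the blck_0/blck_1 tiling in the diff then walks each thread's rectangle in 16x16 tiles, accumulating into the small tmp buffer before one memcpy to the destination row.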
 // ggml_compute_forward_out_prod

 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -12894,7 +13103,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
               struct ggml_tensor * dst) {

-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
@@ -14227,24 +14436,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }

-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}

 // ggml_compute_forward_map_custom2

 static void ggml_compute_forward_map_custom2_f32(
@@ -14263,24 +14454,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }

-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}

 // ggml_compute_forward_map_custom3

 static void ggml_compute_forward_map_custom3_f32(
@@ -14299,24 +14472,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }

+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
+
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
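The v2 forward paths above no longer switch on the input type: they read a ggml_map_custom*_op_params struct back out of dst->op_params and invoke the user callback with the thread pair (ith, nth), so the callback itself is responsible for slicing its own work. A hedged sketch of such a callback, assuming contiguous F32 tensors of equal shape (my_relu and the MIN macro are illustrative, not from the patch):

    #include "ggml.h"

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Hypothetical v2 callback: element-wise ReLU over `a` into `dst`,
    // with rows split across the nth threads by index ith.
    static void my_relu(struct ggml_tensor * dst, const struct ggml_tensor * a,
                        int ith, int nth, void * userdata) {
        (void) userdata; // unused here

        const int64_t nr  = ggml_nrows(a);
        const int64_t dr  = (nr + nth - 1)/nth; // rows per thread (ceil)
        const int64_t ir0 = dr*ith;
        const int64_t ir1 = MIN(ir0 + dr, nr);

        for (int64_t ir = ir0; ir < ir1; ++ir) {
            const float * src = (const float *) ((const char *) a->data   + ir*a->nb[1]);
            float       * out = (float *)       ((char *)       dst->data + ir*dst->nb[1]);
            for (int64_t i = 0; i < a->ne[0]; ++i) {
                out[i] = src[i] > 0.0f ? src[i] : 0.0f;
            }
        }
    }

Because the scheduler hands every participating thread the same (dst, a, userdata) triple, a callback that ignores ith/nth and writes the whole tensor would do the work nth times over; the ceil-divided range keeps the writes disjoint.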
 // ggml_compute_forward_cross_entropy_loss

@@ -14838,25 +15039,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2_F32:
+            {
+                ggml_custom2_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                ggml_custom3_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
             }
             break;
         case GGML_OP_MAP_CUSTOM2:
             {
-                ggml_custom2_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
             }
             break;
         case GGML_OP_MAP_CUSTOM3:
             {
-                ggml_custom3_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
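A detail worth noting in the dispatch above: the legacy _F32 cases recover the user function pointer from tensor->op_params with memcpy rather than a pointer cast, which sidesteps alignment and strict-aliasing problems with the opaque byte buffer. A self-contained illustration of that round-trip (op_fn, scale2 and the buffer are stand-ins, not ggml code):

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    typedef void (*op_fn)(float *, const float *);

    static void scale2(float * dst, const float * src) { *dst = 2.0f*(*src); }

    int main(void) {
        char op_params[64] = {0};                 // stands in for tensor->op_params

        op_fn fun = scale2;
        assert(sizeof(fun) <= sizeof(op_params));
        memcpy(op_params, &fun, sizeof(fun));     // store at graph-build time

        op_fn fun2;
        memcpy(&fun2, op_params, sizeof(fun2));   // recover at compute time

        float in = 3.0f, out = 0.0f;
        fun2(&out, &in);
        printf("%f\n", out);                      // prints 6.000000
        return 0;
    }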
@@ -15664,6 +15880,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
         case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
@@ -16449,12 +16668,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
-            case GGML_OP_MAP_CUSTOM1:
-            case GGML_OP_MAP_CUSTOM2:
-            case GGML_OP_MAP_CUSTOM3:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
                 {
                     n_tasks = 1;
                 } break;
+            case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
+            case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
+            case GGML_OP_MAP_CUSTOM3:
+                {
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
                     n_tasks = n_threads;
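For the v2 ops the planner consults the per-op n_tasks recorded at creation time: GGML_N_TASKS_MAX requests every available thread, and any other value is clamped to n_threads. For example, with n_threads = 8:

    // p->n_tasks == GGML_N_TASKS_MAX (-1)  ->  n_tasks = 8 (use all threads)
    // p->n_tasks == 4                      ->  n_tasks = MIN(4, 8)  = 4
    // p->n_tasks == 16                     ->  n_tasks = MIN(16, 8) = 8

The legacy _F32 ops keep the old behavior of running on a single thread, which is why they stay in the n_tasks = 1 group.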

ggml.h
@@ -183,6 +183,15 @@
 # define GGML_API
 #endif

+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
@@ -374,6 +383,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,

+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
@@ -570,6 +583,8 @@ extern "C" {
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);

+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
@ -1240,7 +1255,7 @@ extern "C" {
// conv_1d with padding = half // conv_1d with padding = half
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph( GGML_API struct ggml_tensor * ggml_conv_1d_ph(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
@@ -1253,7 +1268,7 @@ extern "C" {
         GGML_OP_POOL_COUNT,
     };

-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             enum ggml_op_pool op,
@@ -1261,7 +1276,7 @@ extern "C" {
             int s0, // stride
             int p0); // padding

-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             enum ggml_op_pool op,
@@ -1315,15 +1330,6 @@ extern "C" {
             int h0,
             int w);

-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
     GGML_API struct ggml_tensor * ggml_unary(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1334,63 +1340,138 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_unary_op op);

-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3_inplace instead");
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_custom1_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_custom1_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            ggml_custom2_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            ggml_custom2_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            ggml_custom3_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            ggml_custom3_op_t     fun,
+            int                   n_tasks,
+            void                * userdata);
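Putting the v2 interface together: a caller registers a callback plus an opaque userdata pointer and lets the scheduler choose the thread count. A hedged usage sketch (clamp_op, bound, ctx and x are illustrative names, not from the patch; assumes a contiguous F32 input):

    // Hypothetical custom op: clamp every element of `a` to [-bound, bound],
    // reading the bound through `userdata` and splitting elements by thread.
    static void clamp_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                         int ith, int nth, void * userdata) {
        const float   bound = *(const float *) userdata;
        const int64_t n     = ggml_nelements(dst);
        const int64_t dr    = (n + nth - 1)/nth;
        const int64_t i0    = dr*ith;
        const int64_t i1    = i0 + dr < n ? i0 + dr : n;

        const float * src = (const float *) a->data;   // assumes contiguous F32
        float       * out = (float *) dst->data;
        for (int64_t i = i0; i < i1; ++i) {
            out[i] = src[i] >  bound ?  bound :
                     src[i] < -bound ? -bound : src[i];
        }
    }

    // ... then, at graph-build time (ctx and x created elsewhere):
    //
    //     static float bound = 1.0f;
    //     struct ggml_tensor * y = ggml_map_custom1(ctx, x, clamp_op,
    //                                               GGML_N_TASKS_MAX, &bound);

Passing a small fixed n_tasks instead of GGML_N_TASKS_MAX makes sense for callbacks whose work does not subdivide cleanly, since the planner clamps it to n_threads either way.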
     // loss function

@@ -219,7 +219,7 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
+        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
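MAP_POPULATE makes the kernel fault in the whole mapping up front, which only pays off when the caller really intends to touch essentially every page; after this change a partial prefetch request no longer triggers it. A minimal sketch of the flag in isolation (plain POSIX, not llama.cpp code):

    #include <fcntl.h>
    #include <stddef.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Map a file read-only, optionally asking the kernel to pre-fault
    // every page because we expect to read the whole thing.
    void * map_file(const char * path, int populate, size_t * size_out) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) return NULL;

        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return NULL; }

        int flags = MAP_SHARED;
        if (populate) flags |= MAP_POPULATE;   // pre-fault pages up front

        void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, flags, fd, 0);
        close(fd);                             // the mapping outlives the fd

        if (addr == MAP_FAILED) return NULL;
        *size_out = (size_t) st.st_size;
        return addr;
    }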

@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }

 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,   512ull * kB },
         { MODEL_13B,  640ull * kB },
         { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }

 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
@@ -747,12 +747,12 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
-        size_t prefetch_size = 0;
+        size_t prefetch_size = file_loader->file.size;
         size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                prefetch_size += lt.size;
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                prefetch_size -= lt.size;
             }
         }
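The accounting here is inverted relative to the old code: prefetch_size now starts at the full file size and subtracts every tensor that lives off the CPU, instead of starting at zero and adding CPU tensors. Combined with the llama_mmap change above, MAP_POPULATE therefore fires only when no tensors are offloaded, i.e. when the prefetch request still covers the entire file; with, say, 3 GB of a 7 GB model offloaded to the GPU, the request drops to 4 GB and the kernel's ordinary readahead path is used instead.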