diff --git a/.flake8 b/.flake8 index 113ca5fd3..18fba2c15 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,3 @@ [flake8] max-line-length = 125 +ignore = W503 diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 56d17b66c..ea0a05ea1 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -16,5 +16,5 @@ jobs: - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/Package.swift b/Package.swift index 37524edee..b24c9204a 100644 --- a/Package.swift +++ b/Package.swift @@ -13,17 +13,31 @@ let package = Package( products: [ .library(name: "llama", targets: ["llama"]), ], - dependencies: [ - .package(url: "https://github.com/ggerganov/ggml.git", .branch("release")) - ], targets: [ .target( name: "llama", - dependencies: ["ggml"], path: ".", - exclude: ["ggml-metal.metal"], + exclude: [ + "cmake", + "examples", + "scripts", + "models", + "tests", + "CMakeLists.txt", + "ggml-cuda.cu", + "ggml-cuda.h", + "Makefile" + ], sources: [ + "ggml.c", "llama.cpp", + "ggml-alloc.c", + "ggml-backend.c", + "ggml-quants.c", + "ggml-metal.m", + ], + resources: [ + .process("ggml-metal.metal") ], publicHeadersPath: "spm-headers", cSettings: [ diff --git a/README.md b/README.md index 66166c01b..0b4efdd33 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,7 @@ Typically finetunes of the base models below are supported as well. - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) diff --git a/common/common.cpp b/common/common.cpp index 26042ff4c..360470c12 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.samplers_sequence = parse_samplers_input(argv[i]); + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names); } else if (arg == "--sampling-seq") { if (++i >= argc) { invalid_param = true; break; } - sparams.samplers_sequence = argv[i]; + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -915,6 +916,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); @@ -956,8 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str()); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -1107,45 +1116,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) { } // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input) { - std::string output = ""; +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::vector sampler_types_from_names(const std::vector & names) { // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map samplers_symbols { - {"top_k", 'k'}, - {"top-k", 'k'}, - {"top_p", 'p'}, - {"top-p", 'p'}, - {"nucleus", 'p'}, - {"typical_p", 'y'}, - {"typical-p", 'y'}, - {"typical", 'y'}, - {"min_p", 'm'}, - {"min-p", 'm'}, - {"tfs_z", 'f'}, - {"tfs-z", 'f'}, - {"tfs", 'f'}, - {"temp", 't'}, - {"temperature",'t'} + std::unordered_map sampler_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top-k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMP}, + {"temperature", llama_sampler_type::TEMP} }; - // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" - size_t separator = input.find(';'); - while (separator != input.npos) { - std::string name = input.substr(0,separator); - input = input.substr(separator+1); - separator = input.find(';'); - if (samplers_symbols.find(name) != samplers_symbols.end()) { - output += samplers_symbols[name]; + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto& name : names) { + const auto sampler_item = sampler_name_map.find(name); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); } } - if (samplers_symbols.find(input) != samplers_symbols.end()) { - output += samplers_symbols[input]; + return sampler_types; +} + +std::vector sampler_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMP} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + +std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMP: return "temp"; + default : return ""; } - return output; } // @@ -1560,6 +1609,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); diff --git a/common/common.h b/common/common.h index 58500d920..cf2198c8b 100644 --- a/common/common.h +++ b/common/common.h @@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input); +std::vector sampler_types_from_names(const std::vector & names); +std::vector sampler_types_from_chars(const std::string & names_string); +std::vector string_split(std::string input, char separator); +std::string sampler_type_to_name_string(llama_sampler_type sampler_type); // // Model utils diff --git a/common/sampling.cpp b/common/sampling.cpp index 844ad7c53..a001750da 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { - for (auto s : params.samplers_sequence) { - switch (s) { - case 'k': result += "-> top_k "; break; - case 'f': result += "-> tfs_z "; break; - case 'y': result += "-> typical_p "; break; - case 'p': result += "-> top_p "; break; - case 'm': result += "-> min_p "; break; - case 't': result += "-> temp "; break; - default : break; + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; } } } else { @@ -127,8 +122,6 @@ static void sampler_queue( const llama_sampling_params & params, llama_token_data_array & cur_p, size_t & min_keep) { - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; @@ -137,16 +130,16 @@ static void sampler_queue( const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const std::string & samplers_sequence = params.samplers_sequence; + const std::vector & samplers_sequence = params.samplers_sequence; - for (auto s : samplers_sequence) { - switch (s){ - case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; - case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; - case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; - case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; - case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMP: if (dynatemp_range > 0) { float dynatemp_min = std::max(0.0f, temp - dynatemp_range); float dynatemp_max = std::max(0.0f, temp + dynatemp_range); diff --git a/common/sampling.h b/common/sampling.h index 88899c094..2bd6a75d2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -8,6 +8,16 @@ #include #include +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMP = 't' +}; + // sampling parameters typedef struct llama_sampling_params { int32_t n_prev = 64; // number of previous tokens to remember @@ -28,7 +38,15 @@ typedef struct llama_sampling_params { float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate bool penalize_nl = true; // consider newlines as a repeatable token - std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp + + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMP + }; std::string grammar; // optional BNF-like grammar to constrain sampling diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 0d4ea03b4..cae1551a2 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -209,6 +209,8 @@ class Model: return InternLM2Model if model_architecture == "MiniCPMForCausalLM": return MiniCPMModel + if model_architecture == "BertModel": + return BertModel return Model def _is_model_safetensors(self) -> bool: @@ -264,6 +266,8 @@ class Model: return gguf.MODEL_ARCH.INTERNLM2 if arch == "MiniCPMForCausalLM": return gguf.MODEL_ARCH.MINICPM + if arch == "BertModel": + return gguf.MODEL_ARCH.BERT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1629,6 +1633,96 @@ in chat mode so that the conversation can end normally.") self.post_write_tensors(tensor_map, name, data_torch) +class BertModel(Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + + def set_gguf_parameters(self): + # TODO(cebtenzzre): merge with parent class + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + path = self.dir_model + added_tokens_path = self.dir_model if self.dir_model.exists() else None + + # use huggingface vocab to get all tokens + vocab = HfVocab(path, added_tokens_path) + tokens, scores, toktypes = zip(*vocab.all_tokens()) + assert len(tokens) == vocab.vocab_size + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + n_token_types = len(set(toktypes)) + self.gguf_writer.add_token_type_count(n_token_types) + + # convert to phantom space vocab + def phantom(tok, typ): + if tok.startswith(b"[") and tok.endswith(b"]"): + return tok + if tok.startswith(b"##"): + return tok[2:] + return b"\xe2\x96\x81" + tok + tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)] + + # set up bos and eos tokens (cls and sep) + self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) + self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def write_tensors(self): + tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + tensors = dict(self.get_tensors()) + for name, data_torch in tensors.items(): + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + continue # we don't need these + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + new_dtype: type[np.floating[Any]] + + if ( + self.ftype == 1 and name.endswith(".weight") and n_dims == 2 + and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 + ): + # if f16 desired, convert any float32 2-dim weight tensors to float16 + new_dtype = np.float16 + else: + # if f32 desired, convert any float16 to float32 + new_dtype = np.float32 + + print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") + + if data.dtype != new_dtype: + data = data.astype(new_dtype) + + self.gguf_writer.add_tensor(new_name, data) + + ###### CONVERSION LOGIC ###### diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index d2be805d1..def210531 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -88,7 +88,8 @@ def main(): gguf_writer.add_embedding_length(hidden_size) gguf_writer.add_block_count(block_count) gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size) - gguf_writer.add_rope_dimension_count(hidden_size // head_count) + # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443 + gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) gguf_writer.add_head_count(head_count) gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 3295cd240..27376c8f0 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -87,7 +87,17 @@ int main(int argc, char ** argv) { } const int n_embd = llama_n_embd(model); - const auto * embeddings = llama_get_embeddings(ctx); + auto * embeddings = llama_get_embeddings(ctx); + + // l2-normalize embeddings + float norm = 0; + for (int i = 0; i < n_embd; i++) { + norm += embeddings[i] * embeddings[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n_embd; i++) { + embeddings[i] /= norm; + } for (int i = 0; i < n_embd; i++) { printf("%f ", embeddings[i]); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 4cd5d99bb..2f7be8a13 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -337,24 +337,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int params.mem_buffer = NULL; params.no_alloc = true; struct ggml_context * ctx = NULL; - struct ggml_allocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; + struct ggml_gallocr * alloc = NULL; + struct ggml_cgraph * gf = NULL; ctx = ggml_init(params); - alloc = ggml_allocr_new_measure(tensor_alignment); + alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); - ggml_free(ctx); - static std::vector data_compute; - data_compute.resize(alloc_size + tensor_alignment); - - ctx = ggml_init(params); - alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); + ggml_gallocr_alloc_graph(alloc, gf); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); static std::vector data_work; @@ -363,6 +353,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int ggml_graph_compute(gf, &cplan); + ggml_gallocr_free(alloc); ggml_free(ctx); return true; } diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b7e19c5fe..b11c56020 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" #include "llama.h" #include "common.h" #include "train.h" @@ -13,8 +14,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const size_t tensor_alignment = 32; - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -128,7 +127,7 @@ struct my_llama_lora_layer { struct my_llama_lora { struct ggml_context * ctx = NULL; - std::vector data; + ggml_backend_buffer_t data; my_llama_lora_hparams hparams; @@ -372,63 +371,6 @@ static void set_param_lora(struct my_llama_lora * lora) { } } -static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) { - ggml_allocr_alloc(alloc, lora->tok_embeddings_a); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b); - ggml_allocr_alloc(alloc, lora->norm_a); - ggml_allocr_alloc(alloc, lora->norm_b); - ggml_allocr_alloc(alloc, lora->output_a); - ggml_allocr_alloc(alloc, lora->output_b); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a); - ggml_allocr_alloc(alloc, layer.attention_norm_b); - ggml_allocr_alloc(alloc, layer.wq_a); - ggml_allocr_alloc(alloc, layer.wq_b); - ggml_allocr_alloc(alloc, layer.wk_a); - ggml_allocr_alloc(alloc, layer.wk_b); - ggml_allocr_alloc(alloc, layer.wv_a); - ggml_allocr_alloc(alloc, layer.wv_b); - ggml_allocr_alloc(alloc, layer.wo_a); - ggml_allocr_alloc(alloc, layer.wo_b); - ggml_allocr_alloc(alloc, layer.ffn_norm_a); - ggml_allocr_alloc(alloc, layer.ffn_norm_b); - ggml_allocr_alloc(alloc, layer.w1_a); - ggml_allocr_alloc(alloc, layer.w1_b); - ggml_allocr_alloc(alloc, layer.w2_a); - ggml_allocr_alloc(alloc, layer.w2_b); - ggml_allocr_alloc(alloc, layer.w3_a); - ggml_allocr_alloc(alloc, layer.w3_b); - } - ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); - ggml_allocr_alloc(alloc, lora->norm_a->grad); - ggml_allocr_alloc(alloc, lora->norm_b->grad); - ggml_allocr_alloc(alloc, lora->output_a->grad); - ggml_allocr_alloc(alloc, lora->output_b->grad); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); - ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); - ggml_allocr_alloc(alloc, layer.wq_a->grad); - ggml_allocr_alloc(alloc, layer.wq_b->grad); - ggml_allocr_alloc(alloc, layer.wk_a->grad); - ggml_allocr_alloc(alloc, layer.wk_b->grad); - ggml_allocr_alloc(alloc, layer.wv_a->grad); - ggml_allocr_alloc(alloc, layer.wv_b->grad); - ggml_allocr_alloc(alloc, layer.wo_a->grad); - ggml_allocr_alloc(alloc, layer.wo_b->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); - ggml_allocr_alloc(alloc, layer.w1_a->grad); - ggml_allocr_alloc(alloc, layer.w1_b->grad); - ggml_allocr_alloc(alloc, layer.w2_a->grad); - ggml_allocr_alloc(alloc, layer.w2_b->grad); - ggml_allocr_alloc(alloc, layer.w3_a->grad); - ggml_allocr_alloc(alloc, layer.w3_b->grad); - } -} - static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { const auto & lparams = lora->hparams; @@ -522,18 +464,8 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora set_param_lora(lora); - // measure data size - size_t size = 0; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - size += GGML_PAD(ggml_nbytes(t), tensor_alignment); - } - - // allocate data - struct ggml_allocr * alloc = NULL; - lora->data.resize(size + tensor_alignment); - alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); - alloc_lora(alloc, lora); - ggml_allocr_free(alloc); + // allocate data for lora tensors + lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); } static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { @@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl static struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_model * model, struct my_llama_lora * lora, - struct ggml_allocr * alloc, + ggml_gallocr_t alloc, struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, @@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_tokens, const int n_batch, const bool enable_flash_attn, - const bool enable_checkpointing) { + const bool enable_checkpointing, + const bool measure_only) { ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; @@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_allocr_alloc(alloc, KQ_pos); - if (!ggml_allocr_is_measure(alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } + ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] @@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_allocr_alloc(alloc, t36->grad); + ggml_set_input(t36->grad); // KQ_pos ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); @@ -805,11 +732,23 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // note: they will be freed in reverse order for (unsigned int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_allocr_alloc(alloc, checkpoints[i]); + ggml_set_input(checkpoints[i]); } } - ggml_allocr_alloc_graph(alloc, gb); + if (measure_only) { + ggml_gallocr_reserve(alloc, gb); + } else { + ggml_gallocr_alloc_graph(alloc, gb); + + // set KQ_pos + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + } // remove the additional nodes and leafs for (int i = n_leafs_before; i < gb->n_leafs; ++i) { @@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) { printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); if (params.only_write_lora) { save_train_files_data save_data; @@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - - std::vector mem_input_data; - std::vector mem_compute_data; - // context for input tensors without their data struct ggml_init_params ctx_input_params = { ggml_tensor_overhead() * 2, // mem_size @@ -1706,17 +1641,11 @@ int main(int argc, char ** argv) { struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - // measure required memory for input tensors - size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + - GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + - tensor_alignment; - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - // allocate input tensors - mem_input_data.resize(max_input_size); - ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc_inps, tokens_input); - ggml_allocr_alloc(alloc_inps, target_probs); + // measure required memory for input tensors + ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); + size_t max_input_size = ggml_backend_buffer_get_size(input_data); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // context for compute tensors without their data const size_t estimated_compute_size_wo_data = ( @@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + true ); - size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer if (max_compute_size < best_compute_size) { best_compute_size = max_compute_size; best_order = gf->order; } - ggml_allocr_free(alloc); + ggml_gallocr_free(alloc); ggml_free(ctx_compute); } size_t max_compute_size = best_compute_size; @@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) { "invalid"); // allocate compute tensors - mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + false ); - ggml_allocr_free(alloc); - ggml_allocr_free(alloc_inps); - // tokenize data std::vector train_tokens; @@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) { ggml_free(ctx_work); ggml_free(ctx_compute); ggml_free(ctx_input); + ggml_gallocr_free(alloc); + int64_t t1 = ggml_time_ms(); printf("%s: total training time: ", __func__); diff --git a/examples/llava/README.md b/examples/llava/README.md index 721d5e613..19f1a50a2 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b git clone https://huggingface.co/openai/clip-vit-large-patch14-336 ``` -2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +2. Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b ``` -3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: +4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: ```sh python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` -4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: +5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: ```sh python ./convert.py ../llava-v1.5-7b diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9129052a2..ccd0d85ad 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -367,7 +367,7 @@ struct clip_ctx { ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t compute_buffer = NULL; ggml_backend_t backend = NULL; - ggml_allocr * compute_alloc = NULL; + ggml_gallocr_t compute_alloc = NULL; }; static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { @@ -405,31 +405,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); - ggml_allocr_alloc(ctx->compute_alloc, inp_raw); - - if (!ggml_allocr_is_measure(ctx->compute_alloc)) { - float * data = (float *)malloc(ggml_nbytes(inp_raw)); - - for (size_t i = 0; i < imgs->size; i++) { - const int nx = imgs->data[i].nx; - const int ny = imgs->data[i].ny; - GGML_ASSERT(nx == image_size && ny == image_size); - - const int n = nx * ny; - - for (int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; - } - } - } - } - } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); - } + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -438,13 +415,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // concat class_embeddings and patch_embeddings struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_allocr_alloc(ctx->compute_alloc, embeddings); - if (!ggml_allocr_is_measure(ctx->compute_alloc)) { - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); @@ -453,15 +425,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_allocr_alloc(ctx->compute_alloc, positions); - if (!ggml_allocr_is_measure(ctx->compute_alloc)) { - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } + ggml_set_name(positions, "positions"); + ggml_set_input(positions); embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); @@ -560,15 +525,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_allocr_alloc(ctx->compute_alloc, patches); - if (!ggml_allocr_is_measure(ctx->compute_alloc)) { - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; - } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } + ggml_set_name(patches, "patches"); + ggml_set_input(patches); // shape [1, 576, 1024] // ne is whcn, ne = [1024, 576, 1, 1] @@ -809,7 +767,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } // data - size_t buffer_size = 0; + size_t model_size = 0; { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); @@ -817,7 +775,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { enum ggml_type type = gguf_get_tensor_type(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(meta, name); size_t tensor_size = ggml_nbytes(cur); - buffer_size += tensor_size; + model_size += tensor_size; if (verbosity >= 3) { printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); @@ -825,8 +783,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } - buffer_size += n_tensors * 128 /* CLIP PADDING */; - clip_ctx * new_clip = new clip_ctx; // update projector type @@ -886,12 +842,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0); + printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors); + printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -925,12 +881,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } // alloc memory and offload data - new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size); - ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer); + new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); - ggml_allocr_alloc(alloc, cur); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { @@ -949,7 +903,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } - ggml_allocr_free(alloc); fin.close(); } @@ -1077,15 +1030,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // measure mem requirement and allocate { new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend); + new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); - size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf); - ggml_allocr_free(new_clip->compute_alloc); - new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size); - new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer); - + ggml_gallocr_reserve(new_clip->compute_alloc, gf); + size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } @@ -1267,12 +1217,72 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(batch_size == 1); // TODO: support multiple images } - // reset alloc buffer to clean the memory from previous invocations - ggml_allocr_reset(ctx->compute_alloc); - // build the inference graph ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); - ggml_allocr_alloc_graph(ctx->compute_alloc, gf); + ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + + // set inputs + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_positions = num_patches + 1; + + { + struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); + float * data = (float *)malloc(ggml_nbytes(inp_raw)); + + for (size_t i = 0; i < imgs->size; i++) { + const int nx = imgs->data[i].nx; + const int ny = imgs->data[i].ny; + GGML_ASSERT(nx == image_size && ny == image_size); + + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; + } + } + } + } + } + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); + } + + { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); + } if (ggml_backend_is_cpu(ctx->backend)) { ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index f5a3c9b46..e204b56be 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -71,7 +71,7 @@ def bytes_to_unicode(): return dict(zip(bs, cs)) -ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap = argparse.ArgumentParser() ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") ap.add_argument("--text-only", action="store_true", required=False, diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py index 515f6b58d..0a61efdfe 100644 --- a/examples/llava/llava-surgery.py +++ b/examples/llava/llava-surgery.py @@ -42,5 +42,5 @@ if len(clip_tensors) > 0: torch.save(checkpoint, path) print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt new file mode 100644 index 000000000..f80f727a7 --- /dev/null +++ b/examples/llava/requirements.txt @@ -0,0 +1,3 @@ +-r ../../requirements/requirements-convert.txt +pillow~=10.2.0 +torch~=2.1.1 diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d8de7dd38..18235b8a1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,7 +1,9 @@ #include "common.h" +#include "ggml.h" #include "llama.h" #include +#include #include #include #include @@ -73,6 +75,8 @@ int main(int argc, char ** argv){ int n_drafted = 0; int n_accept = 0; + int64_t t_draft_us = 0; + int n_past = inp.size(); bool has_eos = false; @@ -160,7 +164,7 @@ int main(int argc, char ** argv){ // generate n_pred tokens through prompt lookup auto prompt_lookup = [&]() -> void { - int inp_size = inp.size(); + const int inp_size = inp.size(); for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ const llama_token * ngram = &inp[inp_size - ngram_size]; @@ -191,8 +195,12 @@ int main(int argc, char ** argv){ return; }; + const int64_t t_start_draft_us = ggml_time_us(); + prompt_lookup(); + t_draft_us += ggml_time_us() - t_start_draft_us; + llama_decode(ctx, batch_tgt); ++n_past; @@ -210,6 +218,8 @@ int main(int argc, char ** argv){ LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0ed4d79f9..e8ab8cbae 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -98,7 +98,7 @@ static void write_logfile( #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { - if (!is_interacting) { + if (!is_interacting && g_params->interactive) { is_interacting = true; } else { console::cleanup(); @@ -392,7 +392,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); } - if (params.interactive) { + // ctrl+C handling + { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = sigint_handler; @@ -405,7 +406,9 @@ int main(int argc, char ** argv) { }; SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } + if (params.interactive) { LOG_TEE("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { diff --git a/examples/server/README.md b/examples/server/README.md index 1db7cdf21..0f7373ae8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -185,7 +185,7 @@ node index.js `ignore_eos`: Ignore end of stream token and continue generating (default: false). - `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []). + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []). `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp index 43410f803..2eca8a9fb 100644 --- a/examples/server/oai.hpp +++ b/examples/server/oai.hpp @@ -15,9 +15,13 @@ using json = nlohmann::json; inline static json oaicompat_completion_params_parse( - const json &body /* openai api json semantics */) + const json &body, /* openai api json semantics */ + const std::string &chat_template) { json llama_params; + std::string formatted_prompt = chat_template == "chatml" + ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...) + : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...) llama_params["__oaicompat"] = true; @@ -30,7 +34,7 @@ inline static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["prompt"] = formatted_prompt; llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 28684ead3..72c54b42d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -36,6 +36,7 @@ struct server_params std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; + std::string chat_template = "chatml"; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -625,18 +626,36 @@ struct llama_server_context const int n_vocab = llama_n_vocab(model); for (const auto &el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) + if (el.is_array() && el.size() == 2) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) + float bias; + if (el[1].is_number()) { - if (el[1].is_number()) + bias = el[1].get(); + } + else if (el[1].is_boolean() && !el[1].get()) + { + bias = -INFINITY; + } + else + { + continue; + } + + if (el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { - slot->sparams.logit_bias[tok] = el[1].get(); + slot->sparams.logit_bias[tok] = bias; } - else if (el[1].is_boolean() && !el[1].get()) + } + else if (el[0].is_string()) + { + auto toks = llama_tokenize(model, el[0].get(), false); + for (auto tok : toks) { - slot->sparams.logit_bias[tok] = -INFINITY; + slot->sparams.logit_bias[tok] = bias; } } } @@ -1592,10 +1611,6 @@ struct llama_server_context LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); - slot.cache_tokens = prompt_tokens; if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) @@ -1609,6 +1624,10 @@ struct llama_server_context } } + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + + llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, @@ -1862,6 +1881,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); + printf(" --chat-template FORMAT_NAME"); + printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str()); printf("\n"); } @@ -2301,6 +2322,21 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, log_set_target(stdout); LOG_INFO("logging to file is disabled.", {}); } + else if (arg == "--chat-template") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + if (value != "chatml" && value != "llama2") { + fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str()); + invalid_param = true; + break; + } + sparams.chat_template = value; + } else if (arg == "--override-kv") { if (++i >= argc) { @@ -2754,13 +2790,13 @@ int main(int argc, char **argv) // TODO: add mount point without "/v1" prefix -- how? - svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) + svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body)); + json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template); const int task_id = llama.queue_tasks.get_new_id(); llama.queue_results.add_waiting_task_id(task_id); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70cce0721..548548962 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -167,6 +167,34 @@ static T json_value(const json &body, const std::string &key, const T &default_v : default_value; } +inline std::string format_llama2(std::vector messages) +{ + std::ostringstream output; + bool is_inside_turn = false; + + for (auto it = messages.begin(); it != messages.end(); ++it) { + if (!is_inside_turn) { + output << "[INST] "; + } + std::string role = json_value(*it, "role", std::string("user")); + std::string content = json_value(*it, "content", std::string("")); + if (role == "system") { + output << "<>\n" << content << "\n<>\n\n"; + is_inside_turn = true; + } else if (role == "user") { + output << content << " [/INST]"; + is_inside_turn = true; + } else { + output << " " << content << " "; + is_inside_turn = false; + } + } + + LOG_VERBOSE("format_llama2", {{"text", output.str()}}); + + return output.str(); +} + inline std::string format_chatml(std::vector messages) { std::ostringstream chatml_msgs; @@ -180,6 +208,8 @@ inline std::string format_chatml(std::vector messages) chatml_msgs << "<|im_start|>assistant" << '\n'; + LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}}); + return chatml_msgs.str(); } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index eee9d4de3..2e2a8ce08 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" #include "common.h" #include "train.h" #include "llama.h" @@ -19,8 +20,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const size_t tensor_alignment = 32; - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -58,7 +57,7 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; - std::vector data; + ggml_backend_buffer_t data = NULL; my_llama_hparams hparams; @@ -147,39 +146,6 @@ static void set_param_model(struct my_llama_model * model) { } } -static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) { - ggml_allocr_alloc(alloc, model->tok_embeddings); - ggml_allocr_alloc(alloc, model->norm); - ggml_allocr_alloc(alloc, model->output); - for (uint32_t i = 0; i < model->layers.size(); ++i) { - auto & layer = model->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm); - ggml_allocr_alloc(alloc, layer.wq); - ggml_allocr_alloc(alloc, layer.wk); - ggml_allocr_alloc(alloc, layer.wv); - ggml_allocr_alloc(alloc, layer.wo); - ggml_allocr_alloc(alloc, layer.ffn_norm); - ggml_allocr_alloc(alloc, layer.w1); - ggml_allocr_alloc(alloc, layer.w2); - ggml_allocr_alloc(alloc, layer.w3); - } - ggml_allocr_alloc(alloc, model->tok_embeddings->grad); - ggml_allocr_alloc(alloc, model->norm->grad); - ggml_allocr_alloc(alloc, model->output->grad); - for (uint32_t i = 0; i < model->layers.size(); ++i) { - auto & layer = model->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm->grad); - ggml_allocr_alloc(alloc, layer.wq->grad); - ggml_allocr_alloc(alloc, layer.wk->grad); - ggml_allocr_alloc(alloc, layer.wv->grad); - ggml_allocr_alloc(alloc, layer.wo->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm->grad); - ggml_allocr_alloc(alloc, layer.w1->grad); - ggml_allocr_alloc(alloc, layer.w2->grad); - ggml_allocr_alloc(alloc, layer.w3->grad); - } -} - static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; @@ -252,17 +218,8 @@ static void init_model(struct my_llama_model * model) { set_param_model(model); - // measure data size - size_t size = 0; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - size += GGML_PAD(ggml_nbytes(t), tensor_alignment); - } - // allocate data - struct ggml_allocr * alloc = NULL; - model->data.resize(size + tensor_alignment); - alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); - alloc_model(alloc, model); + model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); } static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { @@ -297,7 +254,7 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean, static struct ggml_tensor * llama_build_train_graphs( struct my_llama_model * model, - struct ggml_allocr * alloc, + ggml_gallocr_t alloc, struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, @@ -308,7 +265,8 @@ static struct ggml_tensor * llama_build_train_graphs( const int n_tokens, const int n_batch, const bool enable_flash_attn, - const bool enable_checkpointing) { + const bool enable_checkpointing, + const bool measure_only) { ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; @@ -334,13 +292,7 @@ static struct ggml_tensor * llama_build_train_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_allocr_alloc(alloc, KQ_pos); - if (!ggml_allocr_is_measure(alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } + ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] @@ -448,21 +400,31 @@ static struct ggml_tensor * llama_build_train_graphs( // KQ_pos ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - - ggml_allocr_alloc(alloc, t36->grad); + ggml_set_input(t36->grad); // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < (int) checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_allocr_alloc(alloc, checkpoints[i]); + ggml_set_input(checkpoints[i]); } } //int n_leafs_after = gb->n_leafs; //int n_nodes_after = gb->n_nodes; + if (measure_only) { + // FIXME: will still allocate + ggml_gallocr_reserve(alloc, gb); + } else { + ggml_gallocr_alloc_graph(alloc, gb); - ggml_allocr_alloc_graph(alloc, gb); + if (!measure_only) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + } // remove the additional nodes and leafs for (int i = n_leafs_before; i < gb->n_leafs; ++i) { @@ -1046,7 +1008,7 @@ int main(int argc, char ** argv) { printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); + printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f)); if (params.only_write_model) { save_train_files_data save_data; @@ -1073,11 +1035,6 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - std::vector mem_input_data; - std::vector mem_compute_data; - - ggml_allocr * alloc = NULL; - // context for input tensors without their data struct ggml_init_params ctx_input_params = { ggml_tensor_overhead() * 2, // mem_size @@ -1091,16 +1048,10 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); // measure required memory for input tensors - size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + - GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + - tensor_alignment; - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - // allocate input tensors - mem_input_data.resize(max_input_size); - alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); + ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); + size_t max_input_size = ggml_backend_buffer_get_size(input_data); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // context for compute tensors without their data const size_t estimated_compute_size_wo_data = ( @@ -1127,7 +1078,7 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1140,9 +1091,10 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + true ); - size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer if (max_compute_size < best_compute_size) { best_compute_size = max_compute_size; best_order = gf->order; @@ -1157,9 +1109,8 @@ int main(int argc, char ** argv) { "invalid"); // allocate compute tensors - mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1172,7 +1123,8 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + false ); std::vector train_tokens; diff --git a/flake.lock b/flake.lock index 8cfc78273..239d0686c 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1706732774, - "narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=", + "lastModified": 1707268954, + "narHash": "sha256-2en1kvde3cJVc3ZnTy8QeD2oKcseLFjYPLKhIGDanQ0=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d", + "rev": "f8e2ebd66d097614d51a56a755450d4ae1632df1", "type": "github" }, "original": { diff --git a/ggml-alloc.c b/ggml-alloc.c index f9be6e1cb..c28c37c4f 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -17,397 +17,11 @@ //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) #define AT_PRINTF(...) -// TODO: GGML_PAD ? -static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { - assert(alignment && !(alignment & (alignment - 1))); // power of 2 - size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; - return offset + align; -} -struct free_block { - void * addr; - size_t size; -}; - -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; - size_t alignment; - - int n_free_blocks; - struct free_block free_blocks[MAX_FREE_BLOCKS]; - - size_t max_size; - - bool measure; - -#ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; -#endif -}; - -#ifdef GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == NULL) { - alloc->allocated_tensors[i] = tensor; - return; - } - } - GGML_ASSERT(!"out of allocated_tensors"); -} -static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == tensor || - (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { - alloc->allocated_tensors[i] = NULL; - return; - } - } - printf("tried to free tensor %s not found\n", tensor->name); - GGML_ASSERT(!"tensor not found"); -} -#endif - -// check if a tensor is allocated by this buffer -static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); -} - -static bool ggml_is_view(struct ggml_tensor * t) { +static bool ggml_is_view(const struct ggml_tensor * t) { return t->view_src != NULL; } -void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources - GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated - - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); - size = aligned_offset(NULL, size, alloc->alignment); - - AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); - - size_t max_avail = 0; - - // find the best fitting free block besides the last block - int best_fit_block = -1; - size_t best_fit_size = SIZE_MAX; - for (int i = 0; i < alloc->n_free_blocks - 1; i++) { - struct free_block * block = &alloc->free_blocks[i]; - max_avail = MAX(max_avail, block->size); - if (block->size >= size && block->size <= best_fit_size) { - best_fit_block = i; - best_fit_size = block->size; - } - } - - if (best_fit_block == -1) { - // the last block is our last resort - struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; - max_avail = MAX(max_avail, block->size); - if (block->size >= size) { - best_fit_block = alloc->n_free_blocks - 1; - } else { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n", - __func__, tensor->name, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); - return; - } - } - - struct free_block * block = &alloc->free_blocks[best_fit_block]; - void * addr = block->addr; - block->addr = (char*)block->addr + size; - block->size -= size; - if (block->size == 0) { - // remove block if empty - alloc->n_free_blocks--; - for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - - AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); - - tensor->data = addr; - tensor->buffer = alloc->buffer; - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, tensor); - } - -#ifdef GGML_ALLOCATOR_DEBUG - add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->base + size; - if (cur_max > alloc->max_size) { - printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); - for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i]) { - printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); - } - } - printf("\n"); - } -#endif - - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); -} - -// this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - if (ggml_tallocr_is_own(alloc, tensor) == false) { - // the tensor was not allocated in this buffer - // this can happen because the graph allocator will try to free weights and other tensors from different buffers - // the easiest way to deal with this is just to ignore it - // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); - return; - } - - void * ptr = tensor->data; - - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); - size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); - -#ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, tensor); -#endif - - // see if we can merge with an existing block - for (int i = 0; i < alloc->n_free_blocks; i++) { - struct free_block * block = &alloc->free_blocks[i]; - // check if ptr is at the end of the block - if ((char*)block->addr + block->size == ptr) { - block->size += size; - // check if we can merge with the next block - if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { - block->size += alloc->free_blocks[i+1].size; - alloc->n_free_blocks--; - for (int j = i+1; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - return; - } - // check if ptr is at the beginning of the block - if ((char*)ptr + size == block->addr) { - block->addr = ptr; - block->size += size; - // check if we can merge with the previous block - if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { - alloc->free_blocks[i-1].size += block->size; - alloc->n_free_blocks--; - for (int j = i; j < alloc->n_free_blocks; j++) { - alloc->free_blocks[j] = alloc->free_blocks[j+1]; - } - } - return; - } - } - // otherwise, add a new block - GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); - // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) - int insert_pos = 0; - while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { - insert_pos++; - } - // shift all blocks from insert_pos onward to make room for the new block - for (int i = alloc->n_free_blocks; i > insert_pos; i--) { - alloc->free_blocks[i] = alloc->free_blocks[i-1]; - } - // insert the new block - alloc->free_blocks[insert_pos].addr = ptr; - alloc->free_blocks[insert_pos].size = size; - alloc->n_free_blocks++; -} - -void ggml_tallocr_reset(ggml_tallocr_t alloc) { - alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; - - if (alloc->measure) { - alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows - } else { - alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; - ggml_backend_buffer_reset(alloc->buffer); - } -} - -ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) { - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); - - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); - - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ true, - /*.base = */ ggml_backend_buffer_get_base(buffer), - /*.alignment = */ alignment, - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, - /*.measure = */ false, -#ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, -#endif - }; - - ggml_tallocr_reset(alloc); - - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) { - ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); - alloc->measure = true; - - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) { - // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1); - - // TODO: move alloc initialization to a common ggml_tallocr_new_impl function - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - alloc->measure = true; - ggml_tallocr_reset(alloc); - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend)); -} - -ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) { - // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size); -} - -ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); - - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ false, - /*.base = */ ggml_backend_buffer_get_base(buffer), - /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, - /*.measure = */ false, -#ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, -#endif - }; - - ggml_tallocr_reset(alloc); - - return alloc; -} - -struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) { - return alloc->buffer; -} - -void ggml_tallocr_free(ggml_tallocr_t alloc) { - if (alloc == NULL) { - return; - } - - if (alloc->buffer_owned) { - ggml_backend_buffer_free(alloc->buffer); - } - free(alloc); -} - -bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) { - return alloc->measure; -} - -size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { - // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail - // to avoid this, we add a 10% margin to the buffer size - return alloc->max_size + alloc->max_size/10; -} - -// graph allocator - -struct hash_node { - int n_children; - int n_views; -}; - -struct ggml_gallocr { - ggml_tallocr_t talloc; - struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; -}; - -ggml_gallocr_t ggml_gallocr_new(void) { - ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); - - *galloc = (struct ggml_gallocr) { - /*.talloc = */ NULL, - /*.hash_set = */ {0}, - /*.hash_values = */ NULL, - /*.hash_values_size = */ 0, - /*.hash_allocs = */ NULL, - /*.parse_seq = */ NULL, - /*.parse_seq_len = */ 0, - }; - - return galloc; -} - -void ggml_gallocr_free(ggml_gallocr_t galloc) { - if (galloc == NULL) { - return; - } - - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - if (galloc->hash_allocs != NULL) { - free(galloc->hash_allocs); - } - if (galloc->parse_seq != NULL) { - free(galloc->parse_seq); - } - free(galloc); -} - -void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { - free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); - - for (int i = 0; i < n; i++) { - galloc->parse_seq[i] = list[i]; - } - galloc->parse_seq_len = n; -} - -static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { - size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); - return &galloc->hash_values[i]; -} - static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { if (a->type != b->type) { return false; @@ -447,106 +61,511 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } -static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) { - if (galloc->talloc != NULL) { - return galloc->talloc; - } - - return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)]; +// TODO: GGML_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; } -static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) { - ggml_tallocr_t alloc = node_tallocr(galloc, view); +// tallocr +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; - GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); - if (update_backend) { - view->backend = view->view_src->backend; +ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) { + ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr)); + if (talloc == NULL) { + return NULL; } - // views are initialized in the alloc buffer rather than the view_src buffer - view->buffer = alloc->buffer; - view->data = (char *)view->view_src->data + view->view_offs; - assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + void * base = ggml_backend_buffer_get_base(buffer); + size_t align = ggml_backend_buffer_get_alignment(buffer); - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, view); - } + assert(align && !(align & (align - 1))); // power of 2 + + *talloc = (struct ggml_tallocr) { + /*.buffer = */ buffer, + /*.base = */ base, + /*.alignment = */ align, + /*.offset = */ aligned_offset(base, 0, align), + }; + return talloc; } -static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); +void ggml_tallocr_free(ggml_tallocr_t talloc) { + free(talloc); +} - if (node->data == NULL) { - if (ggml_is_view(node)) { - init_view(galloc, node, true); +void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) { + size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); + size = GGML_PAD(size, talloc->alignment); + + if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { + fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + + void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; + talloc->offset += size; + + assert(((uintptr_t)addr % talloc->alignment) == 0); + + ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); +} + +// dynamic tensor allocator + +struct free_block { + size_t offset; + size_t size; +}; + +struct ggml_dyn_tallocr { + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + size_t max_size; + +#ifdef GGML_ALLOCATOR_DEBUG + struct { + const struct ggml_tensor * tensor; + size_t offset; + } allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i].tensor == NULL) { + alloc->allocated_tensors[i].tensor = tensor; + alloc->allocated_tensors[i].offset = offset; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i].offset == offset) { + alloc->allocated_tensors[i].tensor = NULL; + return; + } + } + fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + +static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block besides the last block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + if (best_fit_block == -1) { + // the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; } else { - // see if we can reuse a parent's buffer (inplace) - if (ggml_op_can_inplace(node->op)) { - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - break; - } + // this should never happen + fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + GGML_UNREACHABLE(); + } + } - // if the node's data is external, then we cannot re-use it - if (ggml_tallocr_is_own(alloc, parent) == false) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); - continue; - } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + size_t offset = block->offset; + block->offset = offset + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } - struct hash_node * p_hn = hash_get(galloc, parent); - if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { - // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite - // the parent's data that it will need later (same layout requirement). the problem is that then - // we cannot free the tensor because the original address of the allocation is lost. - // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views - // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->view_src = view_src; - view_src_hn->n_views += 1; - init_view(galloc, node, false); - return; - } - } else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->view_src = parent; - p_hn->n_views += 1; - init_view(galloc, node, false); + AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset); + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, offset, tensor); + size_t cur_max = offset + size; + if (cur_max > alloc->max_size) { + // sort allocated_tensors by offset + for (int i = 0; i < 1024; i++) { + for (int j = i + 1; j < 1024; j++) { + if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) { + const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; + size_t tmp_offset = alloc->allocated_tensors[i].offset; + alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; + alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset; + alloc->allocated_tensors[j].tensor = tmp_tensor; + alloc->allocated_tensors[j].offset = tmp_offset; + } + } + } + fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i].tensor) { + fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + alloc->allocated_tensors[i].offset, + alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), + ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); + } + } + fprintf(stderr, "\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, offset + size); + + return offset; + + GGML_UNUSED(tensor); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) { + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, offset, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if (block->offset + block->size == offset) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if (offset + size == block->offset) { + block->offset = offset; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].offset = offset; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; + + GGML_UNUSED(tensor); +} + +static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { + alloc->n_free_blocks = 1; + alloc->free_blocks[0].offset = 0; + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + alloc->max_size = 0; +} + +static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { + struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); + + *alloc = (struct ggml_dyn_tallocr) { + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.max_size = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {{0}}, +#endif + }; + + ggml_dyn_tallocr_reset(alloc); + + return alloc; +} + +static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { + free(alloc); +} + +static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { + return alloc->max_size; +} + + +///////////////////////////////////// + +// graph allocator + +struct hash_node { + int n_children; + int n_views; + int buffer_id; + size_t offset; // offset within the buffer + bool allocated; +}; + +// +struct tensor_alloc { + size_t offset; + size_t size_max; // 0 = pre-allocated, unused, or view +}; + +struct node_alloc { + int buffer_id; + struct tensor_alloc dst; + struct tensor_alloc src[GGML_MAX_SRC]; +}; + +struct ggml_gallocr { + ggml_backend_buffer_type_t * bufts; // [n_buffers] + ggml_backend_buffer_t * buffers; // [n_buffers] + struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] + int n_buffers; + + struct ggml_hash_set hash_set; + struct hash_node * hash_values; // [hash_set.size] + + struct node_alloc * node_allocs; // [n_nodes] + int n_nodes; +}; + +ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { + ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); + GGML_ASSERT(galloc != NULL); + + galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); + GGML_ASSERT(galloc->bufts != NULL); + + galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); + GGML_ASSERT(galloc->buffers != NULL); + + galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); + GGML_ASSERT(galloc->buf_tallocs != NULL); + + for (int i = 0; i < n_bufs; i++) { + galloc->bufts[i] = bufts[i]; + galloc->buffers[i] = NULL; + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } + galloc->n_buffers = n_bufs; + + return galloc; +} + +ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) { + return ggml_gallocr_new_n(&buft, 1); +} + +void ggml_gallocr_free(ggml_gallocr_t galloc) { + if (galloc == NULL) { + return; + } + + for (int i = 0; i < galloc->n_buffers; i++) { + if (galloc->buffers != NULL) { + ggml_backend_buffer_free(galloc->buffers[i]); + } + if (galloc->buf_tallocs != NULL) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } + } + + free(galloc->hash_set.keys); + free(galloc->hash_values); + free(galloc->bufts); + free(galloc->buffers); + free(galloc->buf_tallocs); + free(galloc->node_allocs); + free(galloc); +} + +typedef struct ggml_gallocr * ggml_gallocr_t; + +static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { + size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); + return &galloc->hash_values[i]; +} + +static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return ggml_gallocr_hash_get(galloc, t)->allocated; +} + +static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + hn->allocated = true; +} + +static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; +} + +static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + + if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { + hn->allocated = true; + assert(hn->offset == 0); + + // try to reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + + // if the node's data is external, then we cannot re-use it + if (!ggml_gallocr_is_own(galloc, parent)) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + + // outputs cannot be reused + if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { + AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); + continue; + } + + if (!ggml_are_same_layout(node, parent)) { + AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); + continue; + } + + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + if (p_hn->n_children == 1 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + assert(view_src_hn->offset == p_hn->offset); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + view_src_hn->allocated = false; return; } + } else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + return; } } } - ggml_tallocr_alloc(alloc, node); } + // allocate tensor from the buffer + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + return; } } -static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + // graph outputs are never freed + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + AT_PRINTF("not freeing output %s\n", node->name); + return; + } - ggml_tallocr_free_tensor(alloc, node); + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + size_t offset = hn->offset; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); + hn->allocated = false; } -static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) { - const int * parse_seq = galloc->parse_seq; - int parse_seq_len = galloc->parse_seq_len; +static int get_node_buffer_id(const int * node_buffer_ids, int i) { + return node_buffer_ids ? node_buffer_ids[i] : 0; +} + +static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { + // clear hash tables + memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); + memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + + // allocate all graph inputs first to avoid overwriting them + for (int i = 0; i < graph->n_nodes; i++) { + if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (graph->nodes[i]->src[j] == NULL) { + break; + } + if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i)); + } + } + } // count number of children and views - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; - hash_get(galloc, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(galloc, node, true); - } + ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr if (parent == NULL) { break; } - hash_get(galloc, parent)->n_children += 1; - if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(galloc, parent, true); - } + ggml_gallocr_hash_get(galloc, parent)->n_children += 1; } } // allocate tensors - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + int buffer_id = get_node_buffer_id(node_buffer_ids, i); - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if (parse_seq_len == 0 || parse_seq[ind] != -1) { - int i = parse_seq_len ? parse_seq[ind] : ind; - struct ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - allocate_node(galloc, parent); + // allocate parents (only leafs need to be allocated at this point) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } - - // allocate node - allocate_node(galloc, node); - - AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } - } - AT_PRINTF("\n"); + ggml_gallocr_allocate_node(galloc, parent, buffer_id); } + // allocate node + ggml_gallocr_allocate_node(galloc, node, buffer_id); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((parse_seq_len == 0) || parse_seq[ind] == -1) { - int update_start = parse_seq_len ? last_barrier_pos : ind; - int update_end = parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = parse_seq_len ? parse_seq[i] : i; - struct ggml_tensor * node = gf->nodes[node_i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + p_hn->n_children -= 1; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(galloc, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { - free_node(galloc, view_src); - } - } - else { - free_node(galloc, parent); - } + AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", + parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", + view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { + ggml_gallocr_free_node(galloc, view_src, buffer_id); } } + else if (p_hn->allocated) { + ggml_gallocr_free_node(galloc, parent, buffer_id); + } } AT_PRINTF("\n"); - if (parse_seq_len) { - last_barrier_pos = ind + 1; + } + } +} + +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { + size_t hash_size = graph->visited_hash_table.size; + + // initialize hash table + if (galloc->hash_set.size < hash_size) { + free(galloc->hash_set.keys); + free(galloc->hash_values); + galloc->hash_set.size = hash_size; + galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); + galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + GGML_ASSERT(galloc->hash_set.keys != NULL); + GGML_ASSERT(galloc->hash_values != NULL); + } else { + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); + } + + // reset allocators + for (int i = 0; i < galloc->n_buffers; i++) { + ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); + } + + // allocate in hash table + ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); + + // set the node_allocs from the hash table + if (galloc->n_nodes < graph->n_nodes) { + free(galloc->node_allocs); + galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + GGML_ASSERT(galloc->node_allocs != NULL); + } + galloc->n_nodes = graph->n_nodes; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + if (node->view_src || node->data) { + node_alloc->dst.offset = SIZE_MAX; + node_alloc->dst.size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (!src || src->view_src || src->data) { + node_alloc->src[j].offset = SIZE_MAX; + node_alloc->src[j].size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].offset = hn->offset; + node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); } } } -} -size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) { - size_t hash_size = graph->visited_hash_table.size; + // reallocate buffers if needed + for (int i = 0; i < galloc->n_buffers; i++) { + size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; + size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - // check if the hash table is initialized and large enough - if (galloc->hash_set.size < hash_size) { - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); + if (new_size > cur_size) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + ggml_backend_buffer_free(galloc->buffers[i]); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + if (galloc->buffers[i] == NULL) { + fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); + } + + return true; +} + +bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { + return ggml_gallocr_reserve_n(galloc, graph, NULL); +} + +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) { + assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); + + if (node->view_src != NULL) { + if (node->buffer == NULL) { + assert(tensor_alloc->offset == SIZE_MAX); + if (node->view_src->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node); } - galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); - galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + } else { + if (node->data == NULL) { + assert(tensor_alloc->offset != SIZE_MAX); + assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); + void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]); + void * addr = (char *)base + tensor_alloc->offset; + ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr); + } else { + if (node->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + +#ifndef NDEBUG + size_t offset = + (char *)node->data - + (char *)ggml_backend_buffer_get_base(node->buffer); + size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node); + assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset); + assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max); +#endif + } + } +} + +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + return talloc->size_max >= node_size; +} + +static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (galloc->n_nodes != graph->n_nodes) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of nodes\n", __func__); +#endif + return true; } - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; - galloc->talloc = talloc; - ggml_tallocr_alloc_graph_impl(galloc, graph); - galloc->talloc = NULL; + if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); +#endif + return true; + } - size_t max_size = ggml_tallocr_max_size(talloc); - - return max_size; -} - -void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) { - const size_t hash_size = hash_set.size; - - GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); - - galloc->talloc = NULL; - - // alloc hash_values if needed - if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { - free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); - galloc->hash_values_size = hash_size; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { +#ifndef NDEBUG + fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); +#endif + return true; + } + } } - // free hash_set.keys if needed - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - galloc->hash_set = hash_set; - - // reset hash values - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - - galloc->hash_allocs = hash_node_talloc; - - ggml_tallocr_alloc_graph_impl(galloc, graph); - - // remove unowned resources - galloc->hash_set.keys = NULL; - galloc->hash_allocs = NULL; + return false; } -// legacy API wrapper - -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; - -static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { - ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); - *alloc = (struct ggml_allocr) { - /*.talloc = */ talloc, - /*.galloc = */ ggml_gallocr_new(), - }; - return alloc; -} - -ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment)); -} - -ggml_allocr_t ggml_allocr_new_measure(size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment)); -} - -ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer)); -} - -ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size)); -} - -ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend)); -} - -struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) { - return ggml_tallocr_get_buffer(alloc->talloc); -} - -void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { - ggml_gallocr_set_parse_seq(alloc->galloc, list, n); -} - -void ggml_allocr_free(ggml_allocr_t alloc) { - if (alloc == NULL) { - return; +bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (ggml_gallocr_needs_realloc(galloc, graph)) { + if (galloc->n_buffers == 1) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); +#endif + if (!ggml_gallocr_reserve(galloc, graph)) { + return false; + } + } else { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); +#endif + return false; + } } - ggml_gallocr_free(alloc->galloc); - ggml_tallocr_free(alloc->talloc); - free(alloc); + // reset buffers + for (int i = 0; i < galloc->n_buffers; i++) { + // zero size buffers are not allocated + if (galloc->buffers[i] != NULL) { + ggml_backend_buffer_reset(galloc->buffers[i]); + } + } + + // allocate the graph tensors from the previous assignments + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]); + } + ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst); + } + + return true; } -bool ggml_allocr_is_measure(ggml_allocr_t alloc) { - return ggml_tallocr_is_measure(alloc->talloc); -} +size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); -void ggml_allocr_reset(ggml_allocr_t alloc) { - ggml_tallocr_reset(alloc->talloc); -} - -void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) { - ggml_tallocr_alloc(alloc->talloc, tensor); -} - -size_t ggml_allocr_max_size(ggml_allocr_t alloc) { - return ggml_tallocr_max_size(alloc->talloc); -} - -size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) { - return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); + if (galloc->buffers[buffer_id] == NULL) { + return 0; + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } // utils @@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return false; } - ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); + struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer); for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { ggml_tallocr_alloc(tallocr, t); - } else { + } else if (t->buffer == NULL) { ggml_backend_view_init(buffer, t); } } else { - if (t->view_src != NULL) { + if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor ggml_backend_view_init(buffer, t); } @@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - // tensor is too large to fit in a single buffer fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), @@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (n_buffers == 0) { - // all the tensors in the context are already allocated #ifndef NDEBUG fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); #endif diff --git a/ggml-alloc.h b/ggml-alloc.h index 4e5997521..1d9085d15 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -6,88 +6,62 @@ extern "C" { #endif -struct ggml_backend; -struct ggml_backend_buffer; -struct ggml_backend_buffer_type; - -// -// Legacy API -// - -typedef struct ggml_allocr * ggml_allocr_t; - -// initialize allocator for use with CPU backend only -GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment); -GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment); - -// initialize allocator for use with ggml-backend -GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); -GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend); - -GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc); - -// tell the allocator to parse nodes following the order described in the list -// you should call this if your graph are optimized to execute out-of-order -GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n); - -GGML_API void ggml_allocr_free (ggml_allocr_t alloc); -GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc); -GGML_API void ggml_allocr_reset (ggml_allocr_t alloc); -GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor); -GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc); - -GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph); - -// -// ggml-backend v2 API -// - -// Separate tensor and graph allocator objects -// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators -// The original API is kept as a wrapper around the new API +typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; // Tensor allocator typedef struct ggml_tallocr * ggml_tallocr_t; -GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); -GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); -GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size); -GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); -GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft); -GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); - -GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); - -GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc); -GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc); -GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc); -GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor); -GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc); - +GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer); +GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc); +GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor); // Graph allocator +/* + Example usage: + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); + + // optional: create a worst-case graph and reserve the buffers to avoid reallocations + ggml_gallocr_reserve(galloc, build_graph(max_batch)); + + // allocate the graph + struct ggml_cgraph * graph = build_graph(batch); + ggml_gallocr_alloc_graph(galloc, graph); + + printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); + + // evaluate the graph + ggml_backend_graph_compute(backend, graph); +*/ + +// special tensor flags for use with the graph allocator: +// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +// ggml_set_output(): output tensors are never freed and never overwritten + typedef struct ggml_gallocr * ggml_gallocr_t; -GGML_API ggml_gallocr_t ggml_gallocr_new(void); -GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); +GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); +GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); +GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); -GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n); -GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph); +// pre-allocate buffers from a measure graph - does not allocate or modify the graph +// call with a worst-case graph to avoid buffer reallocations +// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +// returns false if the buffer allocation failed +GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids); -// Allocate tensors from the allocators given by the hash table -GGML_API void ggml_gallocr_alloc_graph_n( - ggml_gallocr_t galloc, - struct ggml_cgraph * graph, - struct ggml_hash_set hash_set, - ggml_tallocr_t * hash_node_talloc); +// automatic reallocation if the topology changes when using a single buffer +// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) +GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); // Utils // Create a buffer and allocate all the tensors in a ggml_context -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft); -GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); #ifdef __cplusplus } diff --git a/ggml-backend.c b/ggml-backend.c index 0764dfebc..9ee81b766 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment + GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { return "CPU"; @@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t } GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)buffer->context; + uintptr_t data = (uintptr_t)buffer->context; + + // align the buffer + if (data % TENSOR_ALIGNMENT != 0) { + data = GGML_PAD(data, TENSOR_ALIGNMENT); + } + + return (void *)data; } GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .reset = */ NULL, }; -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; @@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? - - GGML_ASSERT(data != NULL && "failed to allocate buffer"); + void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } @@ -653,6 +662,9 @@ struct ggml_backend_cpu_context { int n_threads; void * work_data; size_t work_size; + + ggml_abort_callback abort_callback; + void * abort_callback_data; }; GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { @@ -691,6 +703,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); } + cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; + cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + return cpu_plan; } @@ -721,9 +736,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.abort_callback = cpu_ctx->abort_callback; + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + ggml_graph_compute(cgraph, &cplan); return true; } @@ -758,12 +775,21 @@ static struct ggml_backend_i cpu_backend_i = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + if (ctx == NULL) { + return NULL; + } - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + if (cpu_backend == NULL) { + free(ctx); + return NULL; + } *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, @@ -783,7 +809,16 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; +} + GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } @@ -847,6 +882,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back ctx->n_buffers = n_buffers; ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); + GGML_ASSERT(ctx->buffers != NULL); + size_t total_size = 0; for (size_t i = 0; i < n_buffers; i++) { ctx->buffers[i] = buffers[i]; @@ -868,6 +905,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, } } +// creates a copy of the tensor with the same memory layout +static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { + struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + dup->nb[i] = tensor->nb[i]; + } + return dup; +} + +static bool ggml_is_view_op(enum ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; +} // scheduler @@ -876,7 +925,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, #define GGML_MAX_SPLIT_INPUTS 16 struct ggml_backend_sched_split { - ggml_tallocr_t tallocr; + int backend_id; int i_start; int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; @@ -891,15 +940,17 @@ struct ggml_backend_sched { int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS]; - ggml_tallocr_t tallocs[GGML_MAX_BACKENDS]; ggml_gallocr_t galloc; // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - // hash values (arrays of [hash_set.size]) - ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // hash values + int * tensor_backend_id; + struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS]; + + int * node_backend_ids; // [n_nodes] + int n_nodes; // copy of the graph with modified inputs struct ggml_cgraph * graph; @@ -909,77 +960,46 @@ struct ggml_backend_sched { struct ggml_context * ctx; + ggml_backend_sched_eval_callback callback_eval; + void * callback_eval_user_data; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) #else __attribute__((aligned(GGML_MEM_ALIGN))) #endif - char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; - - ggml_backend_sched_eval_callback callback_eval; - void * callback_eval_user_data; + char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; }; #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) -#define node_allocr(node) sched->node_talloc[hash_id(node)] +#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)] +#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)]) -static bool ggml_is_view_op(enum ggml_op op) { - return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; -} - -// returns the priority of the backend, lower is better -static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) { +// returns the priority of the backend, lower id is higher priority +static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { for (int i = 0; i < sched->n_backends; i++) { if (sched->backends[i] == backend) { return i; } } - return INT_MAX; + return -1; } -static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) { - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return i; - } - } - return INT_MAX; -} - -static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { if (buffer == NULL) { - return NULL; - } - - // check if this is already allocate in a allocr buffer (from user manual allocations) - for (int i = 0; i < sched->n_backends; i++) { - if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { - return sched->tallocs[i]; - } + return -1; } // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->tallocs[i]; + return i; } } GGML_ASSERT(false && "tensor buffer type not supported by any backend"); } -static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) { - if (allocr == NULL) { - return NULL; - } - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return sched->backends[i]; - } - } - GGML_UNREACHABLE(); -} - #if 0 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) @@ -990,37 +1010,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I #endif // returns the backend that should be used for the node based on the current locations -static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { +static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { + // TODO: use supports_op to check if the backend supports the op + // assign pre-allocated nodes to their backend // dst - ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); - if (cur_allocr != NULL) { + int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.dst"); - return cur_allocr; + return cur_backend; } // view_src - if (node->view_src != NULL) { - cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); - if (cur_allocr != NULL) { + if (tensor->view_src != NULL) { + cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.vsrc"); - return cur_allocr; + return cur_backend; } } // assign nodes that use weights to the backend of the weights for (int i = 0; i < GGML_MAX_SRC; i++) { - const struct ggml_tensor * src = node->src[i]; + const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { break; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer); // operations with weights are always run on the same backend as the weights SET_CAUSE(node, "1.wgt%d", i); - return src_allocr; + return src_backend; } } - return NULL; + return -1; } static char * fmt_size(size_t size) { @@ -1033,11 +1055,11 @@ static char * fmt_size(size_t size) { return buffer; } -static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { int cur_split = 0; for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { - ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr); + ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { @@ -1051,17 +1073,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: + ggml_backend_t tensor_backend = tensor_backend(node); fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); + fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; + ggml_backend_t src_backend = tensor_backend(src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } @@ -1069,23 +1089,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } } -// creates a copy of the tensor with the same memory layout -static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { - struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); - for (int i = 0; i < GGML_MAX_DIMS; i++) { - dup->nb[i] = tensor->nb[i]; - } - return dup; -} - - //#define DEBUG_PASS1 //#define DEBUG_PASS2 //#define DEBUG_PASS3 //#define DEBUG_PASS4 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits sched->n_splits = 0; sched->is_reset = false; @@ -1107,28 +1117,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; - if (node_allocr(leaf) != NULL) { + if (tensor_backend_id(leaf) != -1) { // do not overwrite user assignments continue; } - node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); + tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (node_allocr(node) != NULL) { + if (tensor_backend_id(node) != -1) { // do not overwrite user assignments continue; } - node_allocr(node) = sched_allocr_from_cur(sched, node); + tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - if (node_allocr(src) == NULL) { - node_allocr(src) = sched_allocr_from_cur(sched, src); + if (tensor_backend_id(src) == -1) { + tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1143,22 +1153,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.1 expand gpu up { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.1"); } } @@ -1166,22 +1176,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.2 expand gpu down { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.2"); } } @@ -1189,17 +1199,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.3 expand rest up { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1207,17 +1217,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.4 expand rest down { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.4"); } } @@ -1229,9 +1239,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t cur_allocr = node_allocr(node); - if (node->view_src != NULL && cur_allocr == NULL) { - cur_allocr = node_allocr(node) = node_allocr(node->view_src); + int cur_backend_id = tensor_backend_id(node); + if (node->view_src != NULL && cur_backend_id == -1) { + cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1239,14 +1249,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr == NULL) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - node_allocr(src) = node_allocr(src->view_src); + tensor_backend_id(src) = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - node_allocr(src) = cur_allocr; + tensor_backend_id(src) = cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1263,15 +1273,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - sched->splits[0].tallocr = node_allocr(node); + sched->splits[0].backend_id = tensor_backend_id(node); break; } } sched->splits[0].i_start = 0; sched->splits[0].n_inputs = 0; memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + int cur_backend_id = sched->splits[0].backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1279,19 +1288,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g continue; } - ggml_tallocr_t node_allocr = node_allocr(node); + int tensor_backend_id = tensor_backend_id(node); - GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now + GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now - if (node_allocr != cur_allocr) { + if (tensor_backend_id != cur_backend_id) { sched->splits[cur_split].i_end = i; cur_split++; GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].backend_id = tensor_backend_id; sched->splits[cur_split].i_start = i; sched->splits[cur_split].n_inputs = 0; - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); + cur_backend_id = tensor_backend_id; } // find inputs that are not on the same backend @@ -1300,43 +1308,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now - if (src_allocr != node_allocr) { + int src_backend_id = tensor_backend_id(src); + assert(src_backend_id != -1); // all inputs should be assigned by now + if (src_backend_id != tensor_backend_id) { // create a copy of the input in the split's backend size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + if (sched->tensor_copies[id][cur_backend_id] == NULL) { + ggml_backend_t backend = sched->backends[cur_backend_id]; struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; + sched->tensor_copies[id][cur_backend_id] = tensor_copy; + tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); int n_inputs = sched->splits[cur_split].n_inputs++; GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); sched->splits[cur_split].inputs[n_inputs] = src; } - node->src[j] = sched->node_copies[id][cur_backend_id]; - -#if 0 - // check if the input is already in the split - bool found = false; - for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { - if (sched->splits[cur_split].inputs[k] == src) { - found = true; - break; - } - } - - if (!found) { - int n_inputs = sched->splits[cur_split].n_inputs++; - //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; - } -#endif + node->src[j] = sched->tensor_copies[id][cur_backend_id]; } } } @@ -1351,30 +1341,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { + ggml_backend_t tensor_backend = tensor_backend(node); + if (tensor_backend == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } - if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) { fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", + node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now + ggml_backend_t src_backend = tensor_backend(src); + if (src_backend != tensor_backend /* && src_backend != NULL */) { fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); + node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", + j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); } - if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) { fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", - src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", + src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL"); } } } @@ -1388,32 +1378,45 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); - // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id]; + // add a dependency to the input source so that it is not freed before the copy is done - GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); - input_cpy->src[0] = input; + struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + graph_copy->nodes[graph_copy->n_nodes++] = input_dep; + + // add a dependency to the input copy so that it is allocated at the start of the split + sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } for (int j = split->i_start; j < split->i_end; j++) { + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } } sched->graph = graph_copy; } -static void sched_alloc_splits(ggml_backend_sched_t sched) { - ggml_gallocr_alloc_graph_n( - sched->galloc, - sched->graph, - sched->hash_set, - sched->node_talloc); +static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { +#ifndef NDEBUG + fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n"); +#endif + ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n"); + return false; + } + } + + return true; } -static void sched_compute_splits(ggml_backend_sched_t sched) { +static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; @@ -1421,20 +1424,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &splits[i]; - ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr); - int split_backend_id = sched_backend_prio(sched, split_backend); + int split_backend_id = split->backend_id; + ggml_backend_t split_backend = sched->backends[split_backend_id]; // copy the input tensors to the split backend uint64_t copy_start_us = ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id]; GGML_ASSERT(input->buffer != NULL); GGML_ASSERT(input_cpy->buffer != NULL); - // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change - // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } //ggml_backend_synchronize(split_backend); // necessary to measure copy time @@ -1450,7 +1451,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { - ggml_backend_graph_compute(split_backend, &split->graph); + if (!ggml_backend_graph_compute(split_backend, &split->graph)) { + return false; + } //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend @@ -1470,7 +1473,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - ggml_backend_graph_compute(split_backend, &gv); + if (!ggml_backend_graph_compute(split_backend, &gv)) { + return false; + } if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { break; @@ -1492,19 +1497,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { } } #endif -} -static void sched_reset(ggml_backend_sched_t sched) { - for (int i = 0; i < sched->n_backends; i++) { - ggml_tallocr_reset(sched->tallocs[i]); - } - // reset state for the next run - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); - - sched->is_reset = true; + return true; } ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { @@ -1514,9 +1508,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); - sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); - sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); + sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { @@ -1524,14 +1519,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]); } - sched->galloc = ggml_gallocr_new(); + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); - // init measure allocs for each backend - for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]); - } - - sched_reset(sched); + ggml_backend_sched_reset(sched); return sched; } @@ -1540,49 +1530,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } - for (int i = 0; i < sched->n_backends; i++) { - ggml_tallocr_free(sched->tallocs[i]); - } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); free(sched->hash_set.keys); - free(sched->node_talloc); - free(sched->node_copies); + free(sched->tensor_backend_id); + free(sched->tensor_copies); + free(sched->node_backend_ids); free(sched); } -void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once +void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT + memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); + memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - sched_split_graph(sched, measure_graph); - sched_alloc_splits(sched); - - // allocate buffers and reset allocators - for (int i = 0; i < sched->n_backends; i++) { - size_t size = ggml_tallocr_max_size(sched->tallocs[i]); - ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size); - } - - sched_reset(sched); + sched->is_reset = true; } -void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { + ggml_backend_sched_split_graph(sched, measure_graph); + + if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) { + return false; + } + + ggml_backend_sched_reset(sched); + return true; +} + +bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); if (!sched->is_reset) { - sched_reset(sched); + ggml_backend_sched_reset(sched); } - sched_split_graph(sched, graph); - sched_alloc_splits(sched); - sched_compute_splits(sched); -} + ggml_backend_sched_split_graph(sched, graph); + if (!ggml_backend_sched_alloc_splits(sched)) { + return false; + } -void ggml_backend_sched_reset(ggml_backend_sched_t sched) { - sched_reset(sched); -} + if (!ggml_backend_sched_compute_splits(sched)) { + return false; + } + return true; +} void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { sched->callback_eval = callback; @@ -1593,37 +1588,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } -ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); +size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { + int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - return sched->tallocs[backend_index]; -} - -ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); - GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - return ggml_tallocr_get_buffer(sched->tallocs[backend_index]); + return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); + int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - node_allocr(node) = sched->tallocs[backend_index]; + tensor_backend_id(node) = backend_index; } ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { - ggml_tallocr_t allocr = node_allocr(node); - if (allocr == NULL) { + int backend_index = tensor_backend_id(node); + if (backend_index == -1) { return NULL; } - return get_allocr_backend(sched, allocr); + return sched->backends[backend_index]; } // utils void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1647,7 +1635,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor ggml_backend_buffer_init_tensor(buffer, tensor); } -static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, +static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) { GGML_ASSERT(src != NULL); @@ -1660,7 +1648,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); if (src->view_src != NULL) { - dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); + dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); dst->view_offs = src->view_offs; } dst->op = src->op; @@ -1673,14 +1661,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru if (s == NULL) { break; } - dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); + dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); } node_copies[id] = dst; return dst; } -static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { +static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -1689,7 +1677,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { - graph_init_tensor(hash_set, node_copies, node_init, src->view_src); + graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1702,17 +1690,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor if (s == NULL) { break; } - graph_init_tensor(hash_set, node_copies, node_init, s); + graph_copy_init_tensor(hash_set, node_copies, node_init, s); } } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = { /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) + /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT }; - struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); - bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); + struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT + bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), @@ -1741,7 +1729,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); + graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); } // allocate nodes @@ -1766,7 +1754,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(hash_set, node_copies, node_init, node); } // build graph copy diff --git a/ggml-backend.h b/ggml-backend.h index 8b8160fcf..f13c69bff 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -83,8 +83,9 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); @@ -129,11 +130,7 @@ extern "C" { // in build_graph: build_graph(...) { - // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) - alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu); - ggml_allocr_alloc(alloc_cpu, tensor); - - // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) + // manually assign nodes to a backend (optional, should not be needed in most cases) struct ggml_tensor * node = ggml_mul_mat(ctx, ...); ggml_backend_sched_set_node_backend(sched, node, backend_gpu); } @@ -163,20 +160,19 @@ extern "C" { GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // Get the number of splits of the last graph GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); - GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler - GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs + // Reset all assignments and allocators - must be called before changing the node backends GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // Set a callback to be called for each resulting node during graph compute diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5053757e6..96976f248 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -150,8 +150,8 @@ #define CUDA_USE_TENSOR_CORES #endif -// max batch size to use MMQ kernels when tensor cores are available -#define MMQ_MAX_BATCH_SIZE 32 +#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels +#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -5310,51 +5310,59 @@ template static __global__ void #endif // __CUDA_ARCH__ >= CC_VOLTA } -#define MMVQ_NWARPS_NVIDIA 4 -#define MMVQ_NWARPS_AMD_RDNA2 1 -#define MMVQ_NWARPS_AMD_OLD 4 - -template +template #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -__launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants +// tell the compiler to use as many registers as it wants, see nwarps definition below +__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { - const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par; +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) + constexpr int nwarps = 1; + constexpr int rows_per_cuda_block = 1; +#else + constexpr int nwarps = ncols_y <= 4 ? 4 : 2; + constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - const int row = blockIdx.x; - - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + const int row0 = rows_per_cuda_block*blockIdx.x; + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; // partial sum for each thread - float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f}; + float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; - for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) { - const int ibx = row*blocks_per_row_x + i; // x block index + for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { + const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx - - const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int + // x block quant index when casting the quants to int + const int kqs = vdr * (tid % (qi/vdr)); #pragma unroll for (int j = 0; j < ncols_y; ++j) { - tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs); +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp[j][i] += vec_dot_q_cuda( + &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs); + } } } - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE]; + __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; if (threadIdx.y > 0) { #pragma unroll for (int j = 0; j < ncols_y; ++j) { - tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j]; +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; + } } } __syncthreads(); @@ -5366,13 +5374,16 @@ static __global__ void mul_mat_vec_q( #pragma unroll for (int j = 0; j < ncols_y; ++j) { #pragma unroll - for (int i = 0; i < nwarps-1; ++i) { - tmp[j] += tmp_shared[i][j][threadIdx.x]; + for (int i = 0; i < rows_per_cuda_block; ++i) { +#pragma unroll + for (int l = 0; l < nwarps-1; ++l) { + tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; + } + tmp[j][i] = warp_reduce_sum(tmp[j][i]); } - tmp[j] = warp_reduce_sum(tmp[j]); - if (threadIdx.x == 0) { - dst[j*nrows_dst + row] = tmp[j]; + if (threadIdx.x < rows_per_cuda_block) { + dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; } } } @@ -6851,65 +6862,75 @@ static void mul_mat_vec_q_cuda( const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { GGML_ASSERT(ncols_x % qk == 0); - GGML_ASSERT(ncols_y <= 4); + GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); int id; CUDA_CHECK(cudaGetDevice(&id)); - int nwarps; - if (g_device_caps[id].cc >= CC_OFFSET_AMD) { - nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD; - } else { - nwarps = MMVQ_NWARPS_NVIDIA; - } + int64_t nwarps = 1; + int64_t rows_per_cuda_block = 1; - const dim3 block_nums(nrows_x, 1, 1); + if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 + switch(ncols_y) { + case 1: + nwarps = 4; + rows_per_cuda_block = 1; + break; + case 2: + case 3: + case 4: + nwarps = 4; + rows_per_cuda_block = 2; + break; + case 5: + case 6: + case 7: + case 8: + nwarps = 2; + rows_per_cuda_block = 2; + break; + default: + GGML_ASSERT(false); + break; + } + } + const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block; + const dim3 block_nums(nblocks, 1, 1); const dim3 block_dims(WARP_SIZE, nwarps, 1); - switch (nwarps) { - case 1: switch(ncols_y) { - case 1: - mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 2: - mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 3: - mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 4: - mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - default: - GGML_ASSERT(false); - break; - } break; - case 4: switch(ncols_y) { - case 1: - mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 2: - mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 3: - mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - case 4: - mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); - break; - default: - GGML_ASSERT(false); - break; - } break; - + switch (ncols_y) { + case 1: + mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 2: + mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 3: + mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 4: + mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 5: + mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 6: + mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 7: + mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 8: + mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; default: GGML_ASSERT(false); break; @@ -9735,7 +9756,7 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); @@ -9893,39 +9914,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 int64_t min_compute_capability = INT_MAX; + bool any_pascal_with_slow_fp16 = false; if (split) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; auto & tensor_split = buft_ctx->tensor_split; for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { + // skip devices that are not going to do any work: + if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { + continue; + } + + if (min_compute_capability > g_device_caps[id].cc) { min_compute_capability = g_device_caps[id].cc; } + if (g_device_caps[id].cc == 610) { + any_pascal_with_slow_fp16 = true; + } } } else { - min_compute_capability = g_device_caps[g_main_device].cc; + min_compute_capability = g_device_caps[g_main_device].cc; + any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610; } + // check data types and tensor shapes for custom matrix multiplication kernels: + bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + + bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; + + bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) const bool fp16_performance_good = min_compute_capability >= CC_RDNA1; - bool use_mul_mat_q = ggml_is_quantized(src0->type); + #ifdef CUDA_USE_TENSOR_CORES use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3; #endif // CUDA_USE_TENSOR_CORES #else - const bool fp16_performance_good = min_compute_capability >= CC_VOLTA; - bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0) + const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16; + + // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1 + use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A; + use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A; + #ifdef CUDA_USE_TENSOR_CORES // when tensor cores are available, use them for large batch size // ref: https://github.com/ggerganov/llama.cpp/pull/3776 - use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE); + use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE); #endif // CUDA_USE_TENSOR_CORES #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type); + // if mmvq is available it's a better choice than dmmv: +#ifndef GGML_CUDA_FORCE_DMMV + use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; +#endif // GGML_CUDA_FORCE_DMMV // debug helpers //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); @@ -9943,33 +9994,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 ggml_cuda_mul_mat_vec_nc(src0, src1, dst); } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch - ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); - } else if (src0->type == GGML_TYPE_F32) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); - } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { - if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) { -#ifdef GGML_CUDA_FORCE_DMMV - const bool use_mul_mat_vec_q = false; -#else - const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); -#endif // GGML_CUDA_FORCE_DMMV - - if (use_mul_mat_vec_q) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); - } else { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); - } - } else { - if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); - } else if (use_mul_mat_q) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); - } else { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); - } - } + ggml_cuda_mul_mat_batched_cublas(src0, src1, dst); + } else if (use_dequantize_mul_mat_vec) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } else if (use_mul_mat_vec_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else if (use_mul_mat_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { - GGML_ASSERT(false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } diff --git a/ggml-metal.m b/ggml-metal.m index 5260ed827..c1d8e2de8 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { + @autoreleasepool { MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; edesc.dispatchType = MTLDispatchTypeSerial; @@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute( [[MTLCaptureManager sharedCaptureManager] stopCapture]; } + } return true; } diff --git a/ggml-quants.c b/ggml-quants.c index 101d3e783..b2a309bf8 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -49,6 +49,8 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define UNUSED GGML_UNUSED + #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) @@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__ARM_NEON) + +#ifdef _MSC_VER + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif + #if !defined(__aarch64__) // 64-bit compatibility @@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_0 * restrict vx0 = vx; + const block_q4_0 * restrict vx1 = vx + bx; + + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_0 * restrict b_x0 = &vx0[i]; + const block_q4_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); + const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); + const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); + const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_1 * restrict vx0 = vx; + const block_q4_1 * restrict vx1 = vx + bx; + const block_q8_1 * restrict vy0 = vy; + const block_q8_1 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t summs0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_1 * restrict b_x0 = &vx0[i]; + const block_q4_1 * restrict b_x1 = &vx1[i]; + const block_q8_1 * restrict b_y0 = &vy0[i]; + const block_q8_1 * restrict b_y1 = &vy1[i]; + + float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s, + GGML_FP16_TO_FP32(b_x1->m) * b_y0->s, + GGML_FP16_TO_FP32(b_x0->m) * b_y1->s, + GGML_FP16_TO_FP32(b_x1->m) * b_y1->s}; + summs0 += summs_t; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + // mmla into int32x4_t + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + sumv2 = sumv2 + summs0; + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q8_0 * restrict vx0 = vx; + const block_q8_0 * restrict vx1 = vx + bx; + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q8_0 * restrict b_x0 = &vx0[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + + const block_q8_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const int8x16_t x0_l = vld1q_s8(b_x0->qs); + const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); + const int8x16_t x1_l = vld1q_s8(b_x1->qs); + const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q3_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; -void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xxs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res #endif } -void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest } // TODO -void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq3_xxs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); - const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; - const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; - const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; - const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); q3 += 16; q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); diff --git a/ggml-quants.h b/ggml-quants.h index bfdf3c997..68f09b1e1 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index dd562a898..cd4b3a1e1 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, } char * dst_ptr = (char *) dst; - const int64_t ne0 = src->ne[0]; - const int64_t nb0 = src->nb[0]; - const int64_t nb1 = src->nb[1]; - const int64_t nb2 = src->nb[2]; - const int64_t nb3 = src->nb[3]; + GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne); + GGML_TENSOR_LOCALS(int64_t, nb, src, nb); const enum ggml_type type = src->type; const int64_t ts = ggml_type_size(type); const int64_t bs = ggml_blck_size(type); @@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1, GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; + GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne); const int64_t nrows = ggml_nrows(src0); //const int n_past = ((int32_t *) dst->op_params)[0]; @@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, ggml_sycl_op_mul_mat_t op, const bool convert_src1_to_q8_1) try { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); const int64_t nrows1 = ggml_nrows(src1); GGML_ASSERT(ne03 == ne13); @@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - const int64_t nb01 = src0->nb[1]; - const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); - const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb); - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); - const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb); const int64_t ne1 = ggml_nelements(src1); const int64_t ne = ggml_nelements(dst); @@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); - const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); - const int64_t ne01 = src00->ne[1]; - const int64_t ne02 = src00->ne[2]; - const int64_t ne03 = src00->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne); //const int64_t nb01 = src00->nb[1]; - const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); - const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03); + GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb); - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb); //const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); - const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); const int64_t ne1 = ggml_nelements(src1); const int64_t ne = ggml_nelements(dst); @@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - - - const int64_t nb00 = src0->nb[0]; - const int64_t nb01 = src0->nb[1]; - const int64_t nb02 = src0->nb[2]; - const int64_t nb03 = src0->nb[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - - - const int64_t nb10 = src1->nb[0]; - const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; - const int64_t nb13 = src1->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 9e2846ee4..7834e635c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -27,6 +27,7 @@ #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) #define VK_VENDOR_ID_AMD 0x1002 +#define VK_VENDOR_ID_APPLE 0x106b #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de @@ -744,6 +745,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz } if (memory_type_index >= mem_props.memoryTypeCount) { + ctx->device.lock()->device.destroyBuffer(buf->buffer); + buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } @@ -2032,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct return ctx->pipeline_matmul_f32_aligned_l.align; } -static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; -#endif +static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { if (bit16_x && bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; + } + if (bit16_x && !bit16_y) { + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + if (bit16_x && bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; + } + if (bit16_x && !bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + if (bit16_x && bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; + } + if (bit16_x && !bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; +#endif + switch (ctx->device.lock()->vendor_id) { + case VK_VENDOR_ID_AMD: + return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned); + case VK_VENDOR_ID_APPLE: + return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned); + case VK_VENDOR_ID_INTEL: + return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned); + } + + if (bit16_x && bit16_y) { + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; + } + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif @@ -2055,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif @@ -2076,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, GGML_ASSERT(false); } - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif @@ -3875,7 +3960,7 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_ctx->preallocate_buffers_graph(" << node << ")" << std::endl; + std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) @@ -3994,8 +4079,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { return; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_ctx->preallocate_buffers()" << std::endl; - std::cerr << "qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << std::endl; + std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); diff --git a/ggml.c b/ggml.c index 452cec282..d70150988 100644 --- a/ggml.c +++ b/ggml.c @@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, + .nrows = 1, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, }, [5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", @@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", @@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #endif } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } @@ -2631,7 +2673,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, /*.op_params =*/ { 0 }, - /*.is_param =*/ false, + /*.flags =*/ 0, /*.grad =*/ NULL, /*.src =*/ { NULL }, /*.perf_runs =*/ 0, @@ -6533,7 +6575,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( void ggml_set_param( struct ggml_context * ctx, struct ggml_tensor * tensor) { - tensor->is_param = true; + tensor->flags |= GGML_TENSOR_FLAG_PARAM; GGML_ASSERT(tensor->grad == NULL); tensor->grad = ggml_dup_tensor(ctx, tensor); @@ -10016,6 +10058,7 @@ static void ggml_compute_forward_mul_mat( ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + int64_t const vec_dot_num_rows = type_traits[type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -10183,12 +10226,23 @@ static void ggml_compute_forward_mul_mat( const int64_t blck_0 = 16; const int64_t blck_1 = 16; + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t nrc = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + nrc = 1; + } + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + // attempt to reduce false-sharing (does not seem to make a difference) - float tmp[16]; + // 16 * 2, accounting for mmla kernels + float tmp[32]; for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) { const int64_t i13 = (ir1/(ne12*ne1)); const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); @@ -10211,17 +10265,19 @@ static void ggml_compute_forward_mul_mat( (src1_cont || src1->type != vec_dot_type ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size : (i11*nb11 + i12*nb12 + i13*nb13)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) { + vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc); + } + + for (int cn = 0; cn < nrc; ++cn) { + memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } - memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } } } @@ -10410,7 +10466,7 @@ static void ggml_compute_forward_mul_mat_id( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } @@ -11592,7 +11648,7 @@ static void ggml_compute_forward_soft_max_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_acc1_f32(nc, dx, -dot_y_dy); ggml_vec_mul_f32 (nc, dx, dx, y); @@ -12393,9 +12449,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f16(ne02, &v, - (ggml_fp16_t *) wdata_src + i1n, - (ggml_fp16_t *) wdata_kernel + i00*ne02); + ggml_vec_dot_f16(ne02, &v, 0, + (ggml_fp16_t *) wdata_src + i1n, 0, + (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12490,9 +12546,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); + ggml_vec_dot_f32(ne02, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12807,9 +12863,9 @@ static void ggml_compute_forward_conv_transpose_2d( for (int i01 = 0; i01 < ne01; i01++) { for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f16(ne03, &v, - wdata_src + i1n, - wdata_kernel + i01*ne00*ne03 + i00*ne03); + ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } } @@ -13238,9 +13294,9 @@ static void ggml_compute_forward_flash_attn_f32( const int i1 = ik1; ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -13323,9 +13379,9 @@ static void ggml_compute_forward_flash_attn_f32( const int iv3 = iq3; ggml_vec_dot_f32(masked_begin, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S, 0, 1); } } } @@ -13428,9 +13484,9 @@ static void ggml_compute_forward_flash_attn_f16( const int i1 = ik1; ggml_vec_dot_f16(neq0, - S + i1, - (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } } else { for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { @@ -13532,9 +13588,9 @@ static void ggml_compute_forward_flash_attn_f16( const int iv3 = iq3; ggml_vec_dot_f16(nev0, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S16, 0, 1); } } else { for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { @@ -13676,9 +13732,9 @@ static void ggml_compute_forward_flash_ff_f16( const int i1 = ib01; ggml_vec_dot_f16(nea0, - S + i1, - (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + S + i1, 0, + (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0, + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1); } ggml_vec_add_f32(neb01, S, S, (float *) b1->data); @@ -13701,9 +13757,9 @@ static void ggml_compute_forward_flash_ff_f16( for (int64_t ic = 0; ic < nec01; ++ic) { ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0, + S16, 0, 1); } ggml_vec_add_f32(nec01, @@ -13890,9 +13946,9 @@ static void ggml_compute_forward_flash_attn_back_f32( const int i1 = ik1; ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -14037,7 +14093,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); ggml_vec_mul_f32 (masked_begin, S, S, SM); @@ -15335,7 +15391,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return NULL; } - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { return node; } @@ -15369,7 +15425,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( clone->op = node->op; clone->grad = node->grad; - clone->is_param = node->is_param; + clone->flags = node->flags; clone->extra = node->extra; for (int k = 0; k < GGML_MAX_DIMS; ++k) { clone->nb[k] = node->nb[k]; @@ -16401,7 +16457,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); ggml_build_forward_expand(gb, node->grad); } @@ -16692,7 +16748,7 @@ struct ggml_compute_state_shared { atomic_int node_n; // active graph node atomic_int node_task; // active graph node task phase - bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; }; @@ -17905,7 +17961,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -17998,7 +18054,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph continue; } - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { snprintf(color, sizeof(color), "yellow"); } else if (node->grad) { if (ggml_graph_find(gf, node)) { @@ -18172,7 +18228,7 @@ static enum ggml_opt_result ggml_opt_adam( int np = 0; int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_ASSERT(np < GGML_MAX_PARAMS); @@ -18425,7 +18481,7 @@ static enum ggml_opt_result linesearch_backtracking( } // compute the initial gradient in the search direction - ggml_vec_dot_f32(nx, &dginit, g, d); + ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1); // make sure that d points to a descent direction if (0 < dginit) { @@ -18475,7 +18531,7 @@ static enum ggml_opt_result linesearch_backtracking( return count; } - ggml_vec_dot_f32(nx, &dg, g, d); + ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1); // check the Wolfe condition if (dg < params->lbfgs.wolfe * dginit) { @@ -18535,7 +18591,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_ASSERT(np < GGML_MAX_PARAMS); @@ -18736,8 +18792,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); - ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1); + ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1); lm_ys[end[0]] = ys; @@ -18756,7 +18812,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1); lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); @@ -18766,7 +18822,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1); beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); @@ -19010,6 +19066,16 @@ enum ggml_opt_result ggml_opt_resume_g( //////////////////////////////////////////////////////////////////////////////// +void ggml_set_input(struct ggml_tensor * tensor) { + tensor->flags |= GGML_TENSOR_FLAG_INPUT; +} + +void ggml_set_output(struct ggml_tensor * tensor) { + tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; +} + +//////////////////////////////////////////////////////////////////////////////// + void ggml_quantize_init(enum ggml_type type) { ggml_critical_section_start(); @@ -20654,4 +20720,12 @@ int ggml_cpu_has_vsx(void) { #endif } +int ggml_cpu_has_matmul_int8(void) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + return 1; +#else + return 0; +#endif +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index 9b3393ca8..a5bb4c1ab 100644 --- a/ggml.h +++ b/ggml.h @@ -505,11 +505,17 @@ extern "C" { enum ggml_log_level { GGML_LOG_LEVEL_ERROR = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4, GGML_LOG_LEVEL_DEBUG = 5 }; + enum ggml_tensor_flag { + GGML_TENSOR_FLAG_INPUT = 1, + GGML_TENSOR_FLAG_OUTPUT = 2, + GGML_TENSOR_FLAG_PARAM = 4, + }; + // ggml object struct ggml_object { size_t offs; @@ -543,7 +549,7 @@ extern "C" { // op params - allocated as int32_t for alignment int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; - bool is_param; + int32_t flags; struct ggml_tensor * grad; struct ggml_tensor * src[GGML_MAX_SRC]; @@ -567,6 +573,11 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // Abort callback + // If not NULL, called before ggml computation + // If it returns true, the computation is aborted + typedef bool (*ggml_abort_callback)(void * data); + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -576,8 +587,8 @@ extern "C" { int n_threads; // abort ggml_graph_compute when true - bool (*abort_callback)(void * data); - void * abort_callback_data; + ggml_abort_callback abort_callback; + void * abort_callback_data; }; enum ggml_cgraph_eval_order { @@ -2097,6 +2108,12 @@ extern "C" { ggml_opt_callback callback, void * callback_data); + // + // tensor flags + // + GGML_API void ggml_set_input(struct ggml_tensor * tensor); + GGML_API void ggml_set_output(struct ggml_tensor * tensor); + // // quantization // @@ -2283,6 +2300,7 @@ extern "C" { GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_vsx (void); + GGML_API int ggml_cpu_has_matmul_int8(void); // // Internal types and functions exposed for tests and benchmarks @@ -2296,7 +2314,8 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); typedef struct { const char * type_name; @@ -2308,6 +2327,7 @@ extern "C" { ggml_from_float_t from_float_reference; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 4abb0383f..b2e86e182 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -2067,6 +2067,8 @@ type_names = { K_QUANTS_PER_ITERATION = 2 +ASYNCIO_CONCURRENCY = 64 + output_dir = gettempdir() lock = asyncio.Lock() @@ -2291,7 +2293,14 @@ async def main(): tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - await asyncio.gather(*tasks) + # Helper to decorate tasks with semaphore acquisition. + async def withSemaphore(sem, task): + async with sem: + return await task + + # Run tasks concurrently guarded by a concurrency limit. + sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY) + await asyncio.gather(*(withSemaphore(sem, task) for task in tasks)) with open("ggml-vulkan-shaders.hpp", "w") as f: f.write("#include \n\n") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1cfd41c0b..a9c13dd38 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -50,6 +50,7 @@ class Keys: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + CAUSAL = "{arch}.attention.causal" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -60,22 +61,23 @@ class Keys: SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" # @@ -122,6 +124,7 @@ class MODEL_TENSOR(IntEnum): ATTN_OUT = auto() ATTN_NORM = auto() ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() ATTN_ROT_EMBD = auto() FFN_GATE_INP = auto() FFN_NORM = auto() @@ -134,6 +137,7 @@ class MODEL_TENSOR(IntEnum): FFN_UP_EXP = auto() ATTN_Q_NORM = auto() ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -178,6 +182,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", @@ -187,6 +192,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -262,17 +268,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { ], MODEL_ARCH.BERT: [ MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_TYPES, MODEL_TENSOR.POS_EMBD, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_NORM, ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 16808196e..7af58a46c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -357,6 +357,9 @@ class GGUFWriter: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: + self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) @@ -387,6 +390,9 @@ class GGUFWriter: def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) + def add_token_type_count(self, value: int) -> None: + self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value) + def add_token_scores(self, scores: Sequence[float]) -> None: self.add_array(Keys.Tokenizer.SCORES, scores) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 4f16d8504..c7ba1420e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -30,6 +30,7 @@ class TensorNameMap: # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom + "embeddings.LayerNorm", # bert ), # Position embeddings @@ -54,7 +55,6 @@ class TensorNameMap: "transformer.ln_f", # gpt2 gpt-j falcon "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth - "embeddings.LayerNorm", # bert "transformer.norm_f", # mpt "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon @@ -79,7 +79,6 @@ class TensorNameMap: "transformer.h.{bid}.ln_mlp", # falcon40b "model.layers.{bid}.input_layernorm", # llama-hf "layers.{bid}.attention_norm", # llama-pth - "encoder.layer.{bid}.attention.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi "h.{bid}.ln_1", # gpt2 @@ -155,6 +154,11 @@ class TensorNameMap: "model.layers.{bid}.attention.wo", # internlm2 ), + # Attention output norm + MODEL_TENSOR.ATTN_OUT_NORM: ( + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf @@ -171,7 +175,6 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf "layers.{bid}.ffn_norm", # llama-pth - "encoder.layer.{bid}.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi "h.{bid}.ln_2", # gpt2 @@ -266,6 +269,10 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), + + MODEL_TENSOR.LAYER_OUT_NORM: ( + "encoder.layer.{bid}.output.LayerNorm", # bert + ) } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index f14e7df1c..bc0d62bb3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -196,6 +196,7 @@ enum llm_arch { LLM_ARCH_STARCODER, LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, + LLM_ARCH_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, @@ -220,6 +221,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_PERSIMMON, "persimmon" }, { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -261,6 +263,7 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + LLM_KV_ATTENTION_CAUSAL, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -273,6 +276,7 @@ enum llm_kv { LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, LLM_KV_TOKENIZER_SCORES, LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, @@ -316,6 +320,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -328,6 +333,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, @@ -355,6 +361,7 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, @@ -536,6 +543,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_BERT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_BLOOM, { @@ -748,22 +772,37 @@ struct LLM_TN { llm_arch arch; std::string operator()(llm_tensor tensor) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return LLM_TENSOR_NAMES[arch].at(tensor); } std::string operator()(llm_tensor tensor, const std::string & suffix) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; } std::string operator()(llm_tensor tensor, int bid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { + if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + return "__missing__"; + } return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix; } }; @@ -1440,6 +1479,11 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_17M, + MODEL_22M, + MODEL_33M, + MODEL_109M, + MODEL_335M, MODEL_0_5B, MODEL_1B, MODEL_2B, @@ -1481,6 +1525,7 @@ struct llama_hparams { uint32_t n_ff; uint32_t n_expert = 0; uint32_t n_expert_used = 0; + uint32_t n_vocab_type = 0; // for BERT-style token types float f_norm_eps; float f_norm_rms_eps; @@ -1493,6 +1538,8 @@ struct llama_hparams { float f_clamp_kqv; float f_max_alibi_bias; + bool causal_attn = true; + bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1720,6 +1767,7 @@ struct llama_model { llama_vocab vocab; struct ggml_tensor * tok_embd; + struct ggml_tensor * type_embd; struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -1839,8 +1887,6 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_t sched = nullptr; - // allocator for the input tensors - ggml_tallocr * alloc = nullptr; // input tensors ggml_backend_buffer_t buf_input = nullptr; @@ -1850,6 +1896,7 @@ struct llama_context { struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] + struct ggml_tensor * inp_sum; // F32 [1, n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -2829,6 +2876,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { case LLAMA_VOCAB_TYPE_SPM: return "SPM"; case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; default: return "unknown"; } } @@ -3000,6 +3048,26 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_BERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + + switch (hparams.n_layer) { + case 3: + model.type = e_model::MODEL_17M; break; // bge-micro + case 6: + model.type = e_model::MODEL_22M; break; // MiniLM-L6 + case 12: + switch (hparams.n_embd) { + case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small + case 768: model.type = e_model::MODEL_109M; break; // bge-base + } break; + case 24: + model.type = e_model::MODEL_335M; break; // bge-large + } + } break; case LLM_ARCH_BLOOM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3204,6 +3272,16 @@ static void llm_load_vocab( vocab.special_unk_id = -1; vocab.special_sep_id = -1; vocab.special_pad_id = -1; + } else if (tokenizer_name == "bert") { + vocab.type = LLAMA_VOCAB_TYPE_WPM; + + // default special tokens + vocab.special_bos_id = 101; + vocab.special_eos_id = 102; + vocab.special_unk_id = 100; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.add_space_prefix = false; } else { LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); @@ -3232,6 +3310,8 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { + vocab.linefeed_id = vocab.special_pad_id; } else { const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); @@ -3569,6 +3649,7 @@ static bool llm_load_tensors( const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab_type = hparams.n_vocab_type; const int64_t n_ff = hparams.n_ff; GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); @@ -3783,11 +3864,50 @@ static bool llm_load_tensors( layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; - case LLM_ARCH_BLOOM: + case LLM_ARCH_BERT: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); - model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); + model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + } + } break; + case LLM_ARCH_BLOOM: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { @@ -4739,6 +4859,7 @@ struct llm_build_context { const int32_t n_orig_ctx; const bool do_rope_shift; + const bool causal_attn; const llm_build_cb & cb; @@ -4782,6 +4903,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), + causal_attn (hparams.causal_attn), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5625,6 +5747,100 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_bert() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // get input vectors with right size + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0); + + // construct input embeddings (token, type, position) + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + // token types are hardcoded to zero ("Sentence A") + struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + inpL = ggml_add(ctx0, inpL, type_row0); + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens] + + // iterate layers + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur = inpL; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + cb(Vcur, "Vcur", il); + + // seems like we just need to do this for Q? + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // re-add the layer input + cur = ggml_add(ctx0, cur, inpL); + + // attention layer norm + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il); + + struct ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + + // attentions bypass the intermediate layer + cur = ggml_add(ctx0, cur, ffn_inp); + + // output layer norm + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); + + // input for next layer + inpL = cur; + } + + // final output + cur = inpL; + + // pooling + cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); + cb(cur, "result_embed", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_bloom() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6996,12 +7212,10 @@ struct llm_build_context { static struct ggml_cgraph * llama_build_graph( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; - // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { @@ -7022,67 +7236,6 @@ static struct ggml_cgraph * llama_build_graph( struct llm_build_context llm(lctx, batch, cb, worst_case); - // - // set input data - // - - if (!ggml_tallocr_is_measure(lctx.alloc)) { - if (batch.token) { - const int64_t n_tokens = batch.n_tokens; - - ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); - } - - if (batch.embd) { - const int64_t n_embd = llm.n_embd; - const int64_t n_tokens = batch.n_tokens; - - ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); - } - - if (batch.pos) { - const int64_t n_tokens = batch.n_tokens; - - ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); - } - - { - const int64_t n_kv = llm.n_kv; - const int64_t n_tokens = batch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - f = -INFINITY; - } else { - f = 0; - } - data[h*(n_kv*n_tokens) + j*n_kv + i] = f; - } - } - } - } - - if (llm.do_rope_shift) { - const int64_t n_ctx = llm.n_ctx; - - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - } - llm.init(); switch (model.arch) { @@ -7110,6 +7263,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_refact(); } break; + case LLM_ARCH_BERT: + { + result = llm.build_bert(); + } break; case LLM_ARCH_BLOOM: { result = llm.build_bloom(); @@ -7167,6 +7324,83 @@ static struct ggml_cgraph * llama_build_graph( return result; } +static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { + // + // set input data + // + + const auto & hparams = lctx.model.hparams; + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + if (batch.token) { + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + } + + if (batch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + } + + if (batch.pos) { + const int64_t n_tokens = batch.n_tokens; + + ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); + } + + { + const int64_t n_kv = kv_self.n; + const int64_t n_tokens = batch.n_tokens; + + assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + + float * data = (float *) lctx.inp_KQ_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + f = -INFINITY; + } else { + f = 0; + } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; + } + } + } + } + + + { + assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); + float * data = (float *) lctx.inp_sum->data; + + for (int i = 0; i < batch.n_tokens; ++i) { + data[i] = 1.0f/float(batch.n_tokens); + } + } + + if (kv_self.has_shift) { + const int64_t n_ctx = cparams.n_ctx; + + assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -7265,17 +7499,22 @@ static int llama_decode_internal( ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - ggml_cgraph * gf = llama_build_graph(lctx, batch); + ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - - // the embeddings could be the second to last tensor, or the third to last tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - if (strcmp(embeddings->name, "result_norm") != 0) { - embeddings = gf->nodes[gf->n_nodes - 3]; - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + if (strcmp(res->name, "result_output") == 0) { + // the embeddings could be the second to last tensor, or the third to last tensor + if (strcmp(embeddings->name, "result_norm") != 0) { + embeddings = gf->nodes[gf->n_nodes - 3]; + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + } + } else if (strcmp(res->name, "result_embed") == 0) { + embeddings = res; + res = nullptr; + } else { + GGML_ASSERT(false); } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -7285,7 +7524,9 @@ static int llama_decode_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. need a better solution - if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is + // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. + if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } @@ -7303,6 +7544,9 @@ static int llama_decode_internal( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } + + llama_set_inputs(lctx, batch); + ggml_backend_sched_graph_compute(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); @@ -7342,7 +7586,7 @@ static int llama_decode_internal( // extract logits // TODO: do not compute and extract logits if only embeddings are needed // need to update the graphs to skip "result_output" - { + if (res) { auto & logits_out = lctx.logits; #ifndef NDEBUG @@ -7386,9 +7630,11 @@ static int llama_decode_internal( if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; + const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0; + embedding_out.resize(n_embd); ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); - ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float)); ggml_backend_synchronize(embeddings_backend); } @@ -7452,6 +7698,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(false); return unicode_to_bytes_bpe(token_data.text); } + case LLAMA_VOCAB_TYPE_WPM: { + GGML_ASSERT(false); + } default: GGML_ASSERT(false); } @@ -7464,6 +7713,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; return vocab.token_to_id.at(buf); } + case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); } @@ -7934,12 +8184,212 @@ private: llm_bigram_bpe::queue work_queue; }; -typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ +struct llm_tokenizer_wpm { + llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} + + void tokenize(const std::string & text, std::vector & output) { + auto * token_map = &vocab.token_to_id; + + // normalize and split by whitespace + std::vector words = preprocess(text); + + // bos token prepended already + + // find the longest tokens that form the words + for (const std::string &word : words) { + // skip empty words + if (word.size() == 0) { + continue; + } + + // prepend phantom space + std::string word1 = "\xe2\x96\x81" + word; + int n = word1.size(); + + // we're at the start of a new word + int i = 0; + bool match_any = false; + + // move through character position in word + while (i < n) { + // loop through possible match length + bool match = false; + for (int j = n; j > i; j--) { + auto it = token_map->find(word1.substr(i, j - i)); + if (it != token_map->end()) { + output.push_back(it->second); + match = true; + match_any = true; + i = j; + break; + } + } + + // must be an unknown character + if (!match) { + i++; + } + } + + // we didn't find any matches for this word + if (!match_any) { + output.push_back(vocab.special_unk_id); + } + } + + // append eos token + output.push_back(vocab.special_eos_id); + } + + std::vector preprocess(const std::string & text) { + std::string ori_str = normalize(text); + uint64_t ori_size = ori_str.size(); + + // single punct / single symbol / single digit + // baseline: add whitespace on the left and right of punct and chinese characters + std::vector words; + std::string new_str = ""; + uint64_t i = 0; + while (i < ori_size) { + int utf_char_len = utf8_len(ori_str[i]); + if ((utf_char_len == 1) && ispunct(ori_str[i])) { + new_str += " "; + new_str += ori_str[i]; + new_str += " "; + i += 1; + } + else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) { + new_str += " "; + new_str += ori_str.substr(i, 3); + new_str += " "; + i += 3; + } + else { + new_str += ori_str[i]; + i += 1; + } + } + + // split by whitespace + uint64_t l = 0; + uint64_t r = 0; + while (r < new_str.size()) { + // if is whitespace + if (isspace(new_str[r])) { + if (r > l) words.push_back(new_str.substr(l, (r - l))); + l = r + 1; + r = l; + } + else { + r += 1; + } + } + if (r > l) { + words.push_back(new_str.substr(l, (r - l))); + } + return words; + } + + std::string normalize(const std::string & text) { + // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 + std::string text2 = strip_accents(text); + for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) { + char c = text2[i]; + if (c >= 'A' && c <= 'Z') { + text2[i] = c - 'A' + 'a'; + } + } + return text2; + } + + bool is_chinese_char(const std::string & str) { + int len = str.length(); + unsigned int codepoint = 0; + int num_bytes = 0; + int i = 0; + unsigned char ch = static_cast(str[i]); + if (ch <= 0x7f) { + codepoint = ch; + num_bytes = 1; + } else if ((ch >> 5) == 0x06) { + codepoint = ch & 0x1f; + num_bytes = 2; + } else if ((ch >> 4) == 0x0e) { + codepoint = ch & 0x0f; + num_bytes = 3; + } else if ((ch >> 3) == 0x1e) { + codepoint = ch & 0x07; + num_bytes = 4; + } + for (int j = 1; j < num_bytes; ++j) { + if (i + j >= len) { + return false; // incomplete UTF-8 character + } + unsigned char next_ch = static_cast(str[i + j]); + if ((next_ch >> 6) != 0x02) { + return false; // invalid trailing byte + } + codepoint = (codepoint << 6) | (next_ch & 0x3f); + } + if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) || + (codepoint >= 0x3400 && codepoint <= 0x4DBF) || + (codepoint >= 0x20000 && codepoint <= 0x2A6DF) || + (codepoint >= 0x2A700 && codepoint <= 0x2B73F) || + (codepoint >= 0x2B740 && codepoint <= 0x2B81F) || + (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 + (codepoint >= 0xF900 && codepoint <= 0xFAFF) || + (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) || + (codepoint >= 0x3000 && codepoint <= 0x303F) || + (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) { + return true; // NOLINT + } + return false; + } + + std::string strip_accents(const std::string & input_string) { + std::string resultString; + std::map accent_map = { + {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'}, + {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'}, + {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'}, + {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'}, + {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'}, + {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'}, + {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'}, + {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'}, + {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'}, + }; + + for (size_t i = 0; i < input_string.length();) { + int len = utf8_len(input_string[i]); + std::string curChar = input_string.substr(i, len); + auto iter = accent_map.find(curChar); + if (iter != accent_map.end()) { + resultString += iter->second; + } else { + resultString += curChar; + } + i += len; + } + + return resultString; + } + + static size_t utf8_len(char src) { + const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; + } + + const llama_vocab & vocab; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT } FRAGMENT_BUFFER_VARIANT_TYPE; -struct fragment_buffer_variant{ +struct fragment_buffer_variant { fragment_buffer_variant(llama_vocab::id _token) : type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), @@ -7969,8 +8419,7 @@ struct fragment_buffer_variant{ // #define PRETOKENIZERDEBUG -static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) -{ +static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) { // for each special token for (const auto & st: vocab.special_tokens_cache) { const auto & special_token = st.first; @@ -8088,10 +8537,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - for (const auto & fragment: fragment_buffer) - { - if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) - { + for (const auto & fragment: fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer // TODO: It's likely possible to get rid of this string copy entirely @@ -8111,19 +8558,15 @@ static std::vector llama_tokenize_internal(const llama_vocab & llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); tokenizer.tokenize(raw_text, output); - } - else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - { + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } } } break; case LLAMA_VOCAB_TYPE_BPE: { - for (const auto & fragment: fragment_buffer) - { - if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) - { + for (const auto & fragment: fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG @@ -8131,9 +8574,23 @@ static std::vector llama_tokenize_internal(const llama_vocab & #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + output.push_back(fragment.token); } - else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - { + } + } break; + case LLAMA_VOCAB_TYPE_WPM: + { + for (const auto & fragment: fragment_buffer) { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_wpm tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } } @@ -9785,6 +10242,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } ++qs.i_ffn_up; } + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; //} // IK: let's remove this, else Q2_K is almost the same as Q3_K_S @@ -9844,19 +10302,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: - case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_XS: case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; + case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -10797,7 +11255,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*5, + /* .mem_size */ ggml_tensor_overhead()*7, /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -10808,12 +11266,14 @@ struct llama_context * llama_new_context_with_model( ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch); ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); ggml_set_name(ctx->inp_pos, "inp_pos"); ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); + ggml_set_name(ctx->inp_sum, "inp_sum"); ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); @@ -10839,23 +11299,27 @@ struct llama_context * llama_new_context_with_model( ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); - ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); + ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); // initialize scheduler with the worst-case graph - ggml_backend_sched_init_measure(ctx->sched, gf); - ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); + if (!ggml_backend_sched_reserve(ctx->sched, gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + llama_free(ctx); + return nullptr; + } - for (ggml_backend_t backend : ctx->backends) { - ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + for (size_t i = 0; i < ctx->backends.size(); i++) { + ggml_backend_t backend = ctx->backends[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(buf), - ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); } // note: the number of splits during measure is higher than during inference due to the kv shift @@ -11744,6 +12208,7 @@ static std::string llama_decode_text(const std::string & text) { int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) { if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { + case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_SPM: { // NOTE: we accept all unsupported token types, // suppressing them like CONTROL tokens. @@ -11867,6 +12332,7 @@ const char * llama_print_system_info(void) { s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; return s.c_str(); } diff --git a/llama.h b/llama.h index 70b8f5d68..ae134a365 100644 --- a/llama.h +++ b/llama.h @@ -61,6 +61,7 @@ extern "C" { enum llama_vocab_type { LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; enum llama_token_type { diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 111770d55..1a52ff5e9 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data()); - else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data()); + if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); + else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 73ffcd1ca..17e9e4482 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -284,8 +284,8 @@ int main(int argc, char** argv) { else { auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); - else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); + if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 6b2514a11..2c391e641 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.h -> ggml-cuda.h # src/ggml-impl.h -> ggml-impl.h + # src/ggml-kompute.cpp -> ggml-kompute.cpp + # src/ggml-kompute.h -> ggml-kompute.h # src/ggml-metal.h -> ggml-metal.h # src/ggml-metal.m -> ggml-metal.m # src/ggml-mpi.h -> ggml-mpi.h @@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-opencl.h -> ggml-opencl.h # src/ggml-quants.c -> ggml-quants.c # src/ggml-quants.h -> ggml-quants.h + # src/ggml-sycl.cpp -> ggml-sycl.cpp + # src/ggml-sycl.h -> ggml-sycl.h + # src/ggml-vulkan.cpp -> ggml-vulkan.cpp + # src/ggml-vulkan.h -> ggml-vulkan.h # include/ggml/ggml.h -> ggml.h # include/ggml/ggml-alloc.h -> ggml-alloc.h # include/ggml/ggml-backend.h -> ggml-backend.h @@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ + -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \ + -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \ -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \ @@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \ -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ + -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \ + -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \ + -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \ + -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \ -e 's/include\/ggml\/ggml\.h/ggml.h/g' \ -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \ -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7b6c17915..7a23ab162 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -475cbad5c1c834e31e26a2283bc1413181644360 +5070f078a67c18c11736e78316ab715ca9afde16 diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 0097db435..feb34bbc8 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h +cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp +cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal @@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h +cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp +cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h +cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp +cp -rpv ../ggml/src/ggml-vulkan.h ./ggml-vulkan.h cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h new file mode 120000 index 000000000..a49d385a1 --- /dev/null +++ b/spm-headers/ggml-alloc.h @@ -0,0 +1 @@ +../ggml-alloc.h \ No newline at end of file diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h new file mode 120000 index 000000000..17c2cf14f --- /dev/null +++ b/spm-headers/ggml-backend.h @@ -0,0 +1 @@ +../ggml-backend.h \ No newline at end of file diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h new file mode 120000 index 000000000..39215298f --- /dev/null +++ b/spm-headers/ggml.h @@ -0,0 +1 @@ +../ggml.h \ No newline at end of file diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 43df8022d..5e92d5742 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -87,7 +87,7 @@ static float dot_product_error( vdot.from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data()); + qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); const float dot_ref = dot_product(test_data1, test_data2, test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 8ec817344..48d9fae3d 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -346,7 +346,7 @@ int main(int argc, char * argv[]) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns.vec_dot(size, &result, test_q1, test_q2); + qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); return result; }; size_t quantized_size = ggml_row_size(type, size);