From de280085e7917dbb7f5753de5842ff4455f82a81 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jul 2024 23:48:37 +0200 Subject: [PATCH 001/154] examples : Fix `llama-export-lora` example (#8607) * fix export-lora example * add more logging * reject merging subset * better check * typo --- Makefile | 2 +- common/common.cpp | 19 +- common/common.h | 3 +- examples/export-lora/README.md | 15 +- examples/export-lora/export-lora.cpp | 775 +++++++++++++-------------- 5 files changed, 378 insertions(+), 436 deletions(-) diff --git a/Makefile b/Makefile index 52b55dd89..58a93db1a 100644 --- a/Makefile +++ b/Makefile @@ -1322,7 +1322,7 @@ llama-finetune: examples/finetune/finetune.cpp \ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-export-lora: examples/export-lora/export-lora.cpp \ - $(OBJ_GGML) common/log.h + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index dbb724fbb..4c19132f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); return true; } - if (arg == "--lora-base") { - CHECK_ARG - params.lora_base = argv[i]; - return true; - } if (arg == "--control-vector") { CHECK_ARG params.control_vectors.push_back({ 1.0f, argv[i], }); @@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; + params.lora_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { @@ -1583,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); - options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", @@ -1676,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); + options.push_back({ "export-lora" }); + options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); + options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); + options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); + printf("usage: %s [options]\n", argv[0]); for (const auto & o : options) { @@ -3166,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } - fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index 184a53dc0..8240ff99b 100644 --- a/common/common.h +++ b/common/common.h @@ -128,7 +128,6 @@ struct gpt_params { // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale - std::string lora_base = ""; // base model path for the lora adapter std::vector control_vectors; // control vector with user defined scale @@ -255,6 +254,8 @@ struct gpt_params { std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill + + std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; void gpt_params_handle_hf_token(gpt_params & params); diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 1fb17feec..6d51f4b24 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model. 
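In standard LoRA notation (the symbols $A_i$, $B_i$, $r_i$, $\alpha_i$ below are conventions of this summary, not names used in the patch), the merge the new `export-lora` performs turns each base matrix $W$ into

$$ W' = W + \sum_i c_i\, B_i A_i, \qquad c_i = \begin{cases} s_i\,\alpha_i / r_i & \text{if } \alpha_i \neq 0,\\ s_i & \text{otherwise,} \end{cases} $$

where $s_i$ is the user scale from `--lora-scaled`. This matches the `scale = adapters[i]->scale * alpha / rank` computation in `export-lora.cpp` below.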
usage: llama-export-lora [options] options: - -h, --help show this help message and exit - -m FNAME, --model-base FNAME model path from which to load base model (default '') - -o FNAME, --model-out FNAME path to save exported model (default '') - -l FNAME, --lora FNAME apply LoRA adapter - -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S - -t N, --threads N number of threads to use during computation (default: 4) + -m, --model model path from which to load base model (default '') + --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) + --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) + -t, --threads N number of threads to use during computation (default: 4) + -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') ``` For example: @@ -20,7 +19,7 @@ For example: ./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin + --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin ``` -Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters. +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters. diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 16f27aa77..124ee167d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,465 +1,406 @@ - #include "common.h" #include "ggml.h" #include "ggml-alloc.h" +#include #include #include #include +#include -struct lora_info { - std::string filename; +static bool g_verbose = false; + +static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); +} + +static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load input GGUF from " + fname); + } + return ctx_gguf; +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +struct file_input { + struct ggml_context * ctx_meta = nullptr; + struct gguf_context * ctx_gguf = nullptr; + std::ifstream f_in; + std::map tensors; + float alpha; float scale; + + file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + if (!f_in.is_open()) { + throw std::runtime_error("failed to open input gguf from " + fname); + } + + ctx_gguf = load_gguf(fname, &ctx_meta); + alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); + printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) { + std::string name(cur->name); + tensors[name] = cur; + if (g_verbose) { + printf("%s: %s\n", __func__, cur->name); + } + } + } + + ggml_tensor * get_tensor(std::string name) { + if (tensors.find(name) == tensors.end()) { + return nullptr; + } + return tensors[name]; + } + + void read_tensor_data(std::string name, std::vector & buf) { + if (tensors.find(name) == tensors.end()) { + throw std::runtime_error("cannot find tensor with name: " + name); + } + auto len = ggml_nbytes(tensors[name]); + if (buf.size() < len) { + buf.resize(len); + } + auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + f_in.seekg(offset); + f_in.read((char* )buf.data(), len); + } + + ~file_input() { + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } }; -struct export_lora_params { - std::string fn_model_base; - std::string fn_model_out; - std::vector lora; +struct lora_merge_ctx { + // input base model + adapters + file_input base_model; + std::vector> adapters; + + // for computing merged tensor int n_threads; -}; + ggml_backend_t backend = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector read_buf; -struct lora_data { - struct lora_info info; - std::vector data; - struct ggml_context * ctx; + // output file + struct gguf_context * ctx_out; + struct ggml_context * ctx_out_ggml; + std::ofstream fout; - uint32_t lora_r; - uint32_t lora_alpha; -}; + lora_merge_ctx( + std::string & base_fname, + std::vector> & lora_files, + std::string & outfile, + int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + 
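        // Overall flow (a summary of run_merge()/merge_tensor() below, not logic of
        // its own): the base model and every adapter are opened as no-alloc GGUF, so
        // only tensor metadata is resident; each output tensor is then either
        // byte-copied from the base file or recomputed as base + sum_i(scale_i * B_i A_i)
        // on the CPU backend and streamed straight into `fout`.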
fout.exceptions(std::ofstream::failbit); // fail fast on write errors -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; + if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { + throw std::runtime_error("split model is not yet supported"); + } - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; + for (auto lora_inp : lora_files) { + auto fname = std::get<0>(lora_inp); + auto scale = std::get<1>(lora_inp); + std::unique_ptr adapter(new file_input(fname, scale)); + check_metadata_lora(adapter.get()); + adapters.push_back(std::move(adapter)); + } + + ctx_out = gguf_init_empty(); + struct ggml_init_params params = { + /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_out_ggml = ggml_init(params); + backend = ggml_backend_cpu_init(); + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + + void check_metadata_lora(file_input * adapter) { + auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); + auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); + if (general_arch_base != general_arch_lora) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + } + + ggml_type get_out_tensor_type(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + return GGML_TYPE_F32; } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + return GGML_TYPE_F16; } } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } + void run_merge() { + // prepare metadata + gguf_set_kv(ctx_out, base_model.ctx_gguf); + // output is forced to f16 for now + gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; + // check if all lora adapters have the same tensors + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 + static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. 
Please merge the adapter one-by-one instead of merging all at once."; + if (adapters.size() > 1) { + for (size_t i = 1; i < adapters.size(); ++i) { + if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { + throw std::runtime_error(err_no_subset_adapter); + } + for (auto & it : adapters[i]->tensors) { + if (adapters[0]->get_tensor(it.first) == nullptr) { + throw std::runtime_error(err_no_subset_adapter); + } + } + } } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); + + // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile + std::vector> base_tensors; + for (auto & it : base_model.tensors) { + bool t_a = true; + bool t_b = true; + for (auto & adapter : adapters) { + t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); + t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); + } + auto base_tensor = it.second; + struct ggml_tensor * out_tensor; + if (!t_a && !t_b) { + // only copy + out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(out_tensor, base_tensor->name); + base_tensors.push_back(std::make_pair(out_tensor, false)); + } else if (t_a && t_b) { + // need merging + out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + out_tensor->type = get_out_tensor_type(base_tensor); + ggml_set_name(out_tensor, base_tensor->name); + base_tensors.push_back(std::make_pair(out_tensor, true)); + } else { + throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); + } + gguf_add_tensor(ctx_out, out_tensor); } - if (ret != 1) { - die("unexpectedly reached end of file"); + + // placeholder for the meta data + { + size_t meta_size = gguf_get_meta_size(ctx_out); + zeros(fout, meta_size); } - } - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; + // process base model tensors + size_t n_merged = 0; + for (auto & it : base_tensors) { + if (it.second) { + merge_tensor(it.first); + n_merged++; + } else { + copy_tensor(it.first); + } } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); + + // write output metadata + { + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.seekp(0); + fout.write((const char *)data.data(), data.size()); } + + printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size()); } - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); + void copy_tensor(struct ggml_tensor * base) { + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + size_t len = ggml_nbytes(base); + base_model.read_tensor_data(base->name, read_buf); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); } - bool eof() { - return tell() >= size; - } + void merge_tensor(struct ggml_tensor * base) { + std::string name_base(base->name); + std::string name_lora_a = name_base + ".lora_a"; + std::string name_lora_b = name_base + ".lora_b"; - ~llama_file() { - if (fp) { - std::fclose(fp); + printf("%s : %s [%s]\n", __func__, base->name, 
ggml_ne_string(base).c_str()); + + // context for input tensor + std::vector inp_a(adapters.size()); + std::vector inp_b(adapters.size()); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + // alloc tensors + struct ggml_tensor * inp = ggml_dup_tensor(ctx, base); + for (size_t i = 0; i < adapters.size(); ++i) { + auto t_a = adapters[i]->get_tensor(name_lora_a); + auto t_b = adapters[i]->get_tensor(name_lora_b); + inp_a[i] = ggml_dup_tensor(ctx, t_a); + inp_b[i] = ggml_dup_tensor(ctx, t_b); } + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // load data to backend buffer + base_model.read_tensor_data(name_base, read_buf); + ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp)); + for (size_t i = 0; i < adapters.size(); ++i) { + adapters[i]->read_tensor_data(name_lora_a, read_buf); + ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); + adapters[i]->read_tensor_data(name_lora_b, read_buf); + ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); + } + + // build graph + struct ggml_cgraph * gf; + { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(params0); + gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur = inp; + for (size_t i = 0; i < adapters.size(); ++i) { + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i])); + struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]); + // scale + const float alpha = adapters[i]->alpha; + const float rank = (float) inp_b[i]->ne[0]; + const float scale = alpha ? 
adapters[i]->scale * alpha / rank : adapters[i]->scale; + delta = ggml_scale(ctx0, delta, scale); + cur = ggml_add(ctx0, cur, delta); + printf("%s : + merging from adapter[%ld]\n", __func__, i); + printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); + } + cur = ggml_cast(ctx0, cur, get_out_tensor_type(base)); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + } + + // compute + { + ggml_gallocr_alloc_graph(allocr, gf); + ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_graph_compute(backend, gf); + } + + // write data to output file + { + auto result = gf->nodes[gf->n_nodes - 1]; + size_t len = ggml_nbytes(result); + if (read_buf.size() < len) { + read_buf.resize(len); + } + ggml_backend_tensor_get(result, read_buf.data(), 0, len); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + } + + ~lora_merge_ctx() { + ggml_gallocr_free(allocr); + ggml_backend_free(backend); + gguf_free(ctx_out); + ggml_free(ctx_out_ggml); } }; -static struct export_lora_params get_default_export_lora_params() { - struct export_lora_params result; - result.fn_model_base = ""; - result.fn_model_out = ""; - result.n_threads = GGML_DEFAULT_N_THREADS; - return result; -} +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); -static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str()); - fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str()); - fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n"); - fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads); -} - -static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) { - bool invalid_param = false; - std::string arg; - struct export_lora_params default_params = get_default_export_lora_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "-m" || arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "-o" || arg == "--model-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_out = argv[i]; - } else if (arg == "-l" || arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - lora.scale = 1.0f; - params->lora.push_back(lora); - } else if (arg == "-s" || arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - lora.scale = std::stof(argv[i]); - 
params->lora.push_back(lora); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - if (params->n_threads <= 0) { - params->n_threads = std::thread::hardware_concurrency(); - } - } else if (arg == "-h" || arg == "--help") { - export_lora_print_usage(argc, argv, &default_params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - } - - if (params->fn_model_base == default_params.fn_model_base) { - fprintf(stderr, "error: please specify a filename for model-base.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (params->fn_model_out == default_params.fn_model_out) { - fprintf(stderr, "error: please specify a filename for model-out.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - return true; -} - -static void free_lora(struct lora_data * lora) { - if (lora->ctx != NULL) { - ggml_free(lora->ctx); - } - delete lora; -} - -static struct lora_data * load_lora(struct lora_info * info) { - struct lora_data * result = new struct lora_data; - result->info = *info; - result->ctx = NULL; - result->lora_r = 1; - result->lora_alpha = 1; - - struct llama_file file(info->filename.c_str(), "rb"); - if (file.fp == NULL) { - fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n", - info->filename.c_str()); - free_lora(result); - return NULL; - } - - struct ggml_init_params params_ggml; - params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; - params_ggml.mem_buffer = NULL; - params_ggml.no_alloc = true; - result->ctx = ggml_init(params_ggml); - - uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); - } - uint32_t version = file.read_u32(); - if (version != 1) { - die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); - } - result->lora_r = file.read_u32(); - result->lora_alpha = file.read_u32(); - // read tensor infos from file - std::vector name_buf; - std::vector tensors; - std::vector tensors_offset; - size_t total_nbytes_pad = 0; - while(!file.eof()) { - int64_t ne[4] = {1,1,1,1}; - uint32_t n_dims = file.read_u32(); - uint32_t namelen = file.read_u32(); - uint32_t type = file.read_u32(); - for (uint32_t k = 0; k < n_dims; ++k) { - ne[k] = (int64_t)file.read_u32(); - } - name_buf.clear(); - name_buf.resize(namelen + 1, '\0'); - file.read_raw(name_buf.data(), namelen); - file.seek((0-file.tell()) & 31, SEEK_CUR); - size_t offset = file.tell(); - struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); - ggml_set_name(tensor, name_buf.data()); - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); - total_nbytes_pad += nbytes_pad; - tensors.push_back(tensor); - tensors_offset.push_back(offset); - file.seek(nbytes, SEEK_CUR); - } - // read tensor data - result->data.resize(total_nbytes_pad); - size_t data_offset = 0; - for (size_t i = 0; i < tensors.size(); ++i) { - struct ggml_tensor * tensor = tensors[i]; - size_t offset = tensors_offset[i]; - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = 
ggml_nbytes_pad(tensor); - file.seek(offset, SEEK_SET); - tensor->data = result->data.data() + data_offset; - file.read_raw(tensor->data, nbytes); - data_offset += nbytes_pad; - } - return result; -} - - -static struct ggml_cgraph * build_graph_lora( - struct ggml_context * ctx, - struct ggml_tensor * tensor, - struct ggml_tensor * lora_a, - struct ggml_tensor * lora_b, - float scaling -) { - struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); - if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, scaling); - } - struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand (gf, res); - return gf; -} - -static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { - if (lora->ctx == NULL) { - return false; - } - std::string name = ggml_get_name(tensor); - std::string name_a = name + std::string(".loraA"); - std::string name_b = name + std::string(".loraB"); - struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); - struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); - if (lora_a == NULL || lora_b == NULL) { - return false; - } - - float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; - - struct ggml_init_params params; - params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; - params.mem_buffer = NULL; - params.no_alloc = true; - struct ggml_context * ctx = NULL; - struct ggml_gallocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; - - ctx = ggml_init(params); - alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - - ggml_gallocr_alloc_graph(alloc, gf); - - struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); - static std::vector data_work; - data_work.resize(cplan.work_size); - cplan.work_data = data_work.data(); - - ggml_graph_compute(gf, &cplan); - - ggml_gallocr_free(alloc); - ggml_free(ctx); - return true; -} - -static void export_lora(struct export_lora_params * params) { - // load all loras - std::vector loras; - for (size_t i = 0; i < params->lora.size(); ++i) { - struct lora_data * lora = load_lora(¶ms->lora[i]); - if (lora != NULL) { - loras.push_back(lora); - } - } - if (loras.size() == 0) { - fprintf(stderr, "warning: no lora adapters will be applied.\n"); - } - - // open input file - struct llama_file fin(params->fn_model_base.c_str(), "rb"); - if (!fin.fp) { - die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str()); - } - - // open base model gguf, read tensors without their data - struct ggml_context * ctx_in; - struct gguf_init_params params_gguf; - params_gguf.no_alloc = true; - params_gguf.ctx = &ctx_in; - struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); - - // create new gguf - struct gguf_context * gguf_out = gguf_init_empty(); - - // copy meta data from base model: kv and tensors - gguf_set_kv(gguf_out, gguf_in); - int n_tensors = gguf_get_n_tensors(gguf_in); - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - gguf_add_tensor(gguf_out, tensor); - } - - // create output file - struct llama_file fout(params->fn_model_out.c_str(), "wb"); - if (!fout.fp) { - die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str()); - } - - // write gguf meta data - std::vector meta; - 
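    // Layout note (GGUF convention, inferred rather than stated here): the serialized
    // metadata blob leads the file, followed by each tensor's bytes padded to the GGUF
    // alignment; per-tensor offsets such as gguf_get_tensor_offset() are relative to
    // the start of that data section, hence the `offset + meta.size()` seeks below.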
meta.resize(gguf_get_meta_size(gguf_out)); - gguf_get_meta_data(gguf_out, meta.data()); - fout.write_raw(meta.data(), meta.size()); - - std::vector data; - std::vector padding; - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - - // read tensor data - data.resize(ggml_nbytes(tensor)); - tensor->data = data.data(); - size_t offset = gguf_get_tensor_offset(gguf_in, i); - fin.seek(offset + meta.size(), SEEK_SET); - fin.read_raw(data.data(), data.size()); - - // apply all loras - for (size_t k = 0; k < loras.size(); ++k) { - apply_lora(tensor, loras[k], params->n_threads); - } - - // write tensor data + padding - padding.clear(); - padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); - - GGML_ASSERT(fout.tell() == offset + meta.size()); - // fout.seek(offset + meta.size(), SEEK_SET); - fout.write_raw(data.data(), data.size()); - fout.write_raw(padding.data(), padding.size()); - - if (i % 2 == 0) { - printf("."); - } - } + printf("\nexample usage:\n"); + printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); + printf("\nNOTE: output model is F16\n"); printf("\n"); - - // close gguf - gguf_free(gguf_out); - gguf_free(gguf_in); - - // free loras - for (size_t i = 0; i < loras.size(); ++i) { - free_lora(loras[i]); - } } int main(int argc, char ** argv) { - struct export_lora_params params = get_default_export_lora_params(); + gpt_params params; - if (!export_lora_params_parse(argc, argv, ¶ms)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - export_lora(¶ms); + g_verbose = (params.verbosity == 1); + try { + lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); + ctx.run_merge(); + } catch (const std::exception & err) { + fprintf(stderr, "%s\n", err.what()); + exit(EXIT_FAILURE); + } + + printf("done, output file is %s\n", params.lora_outfile.c_str()); return 0; } From b115105f05e3372bc75b2a486c1930c365fd2846 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 24 Jul 2024 11:25:19 +0200 Subject: [PATCH 002/154] add llama_lora_adapter_clear (#8653) --- include/llama.h | 6 +++++- src/llama.cpp | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index e68cd807e..413070d95 100644 --- a/include/llama.h +++ b/include/llama.h @@ -529,12 +529,16 @@ extern "C" { struct llama_lora_adapter * adapter, float scale); - // Remove a LoRA adapter from given context + // Remove a specific LoRA adapter from given context // Return -1 if the adapter is not present in the context LLAMA_API int32_t llama_lora_adapter_remove( struct llama_context * ctx, struct llama_lora_adapter * adapter); + // Remove all LoRA adapters from given context + LLAMA_API void llama_lora_adapter_clear( + struct llama_context * ctx); + // Manually free a LoRA adapter // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); diff --git a/src/llama.cpp b/src/llama.cpp index 40c5e8e8d..04eaf6730 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16201,6 +16201,10 @@ int32_t llama_lora_adapter_remove( return -1; } +void llama_lora_adapter_clear(struct llama_context * ctx) { + ctx->lora_adapters.clear(); +} + void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { delete adapter; } From 
79167d9e49aef9caa98e13ee7ca067ec9f88b4b5 Mon Sep 17 00:00:00 2001
From: Joe Todd
Date: Wed, 24 Jul 2024 11:55:26 +0100
Subject: [PATCH 003/154] Re-add erroneously removed -fsycl from
 GGML_EXTRA_LIBS (#8667)

---
 ggml/src/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 12c440327..c6496c921 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -510,10 +510,10 @@ if (GGML_SYCL)
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
         if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} pthread m dl onemkl)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
         endif()
     endif()
 endif()

From 96952e7181929c6001b2bc69a33f240de731cc3a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 24 Jul 2024 13:48:46 +0200
Subject: [PATCH 004/154] llama : fix `llama_chat_format_single` for mistral
 (#8657)

* fix `llama_chat_format_single` for mistral

* fix typo

* use printf
---
 common/common.cpp            |  2 +-
 examples/main/main.cpp       |  1 +
 tests/test-chat-template.cpp | 30 ++++++++++++++++++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4c19132f1..ec44a0552 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2723,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = past_msg.empty() ?
"" : llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a0d817b1a..61e960ea2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vectorchat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); + LOG("formatted: %s\n", formatted.c_str()); return formatted; } diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 6583dd0b2..46a7d3aea 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -133,13 +132,31 @@ int main(void) { ); formatted_chat.resize(res); std::string output(formatted_chat.data(), formatted_chat.size()); - std::cout << output << "\n-------------------------\n"; + printf("%s\n", output.c_str()); + printf("-------------------------\n"); assert(output == expected); } - // test llama_chat_format_single - std::cout << "\n\n=== llama_chat_format_single ===\n\n"; + + // test llama_chat_format_single for system message + printf("\n\n=== llama_chat_format_single (system message) ===\n\n"); std::vector chat2; + llama_chat_msg sys_msg{"system", "You are a helpful assistant"}; + + auto fmt_sys = [&](std::string tmpl) { + auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false); + printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str()); + printf("-------------------------\n", output.c_str()); + return output; + }; + assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"); + assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n"); + assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message + assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>"); + + + // test llama_chat_format_single for user message + printf("\n\n=== llama_chat_format_single (user message) ===\n\n"); chat2.push_back({"system", "You are a helpful assistant"}); chat2.push_back({"user", "Hello"}); chat2.push_back({"assistant", "I am assistant"}); @@ -147,12 +164,13 @@ int main(void) { auto fmt_single = [&](std::string tmpl) { auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); - std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; + printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str()); + printf("-------------------------\n", output.c_str()); return output; }; assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); assert(fmt_single("llama2") == "[INST] How are you [/INST]"); - assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); + assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); return 0; From 3a7ac5300a7e8ebbe4a3eb5aff9dba11ed76ea61 Mon Sep 17 00:00:00 2001 From: Thorsten Sommer Date: Wed, 24 Jul 2024 14:52:30 +0200 Subject: [PATCH 005/154] readme : update UI list [no ci] (#8505) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7c233b5e1..b7b9bf588 
100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well. Unless otherwise noted these projects are open-source with permissive licensing: +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) - [iohub/collama](https://github.com/iohub/coLLaMA) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) From f19bf99c015d3d745143e8bb4f056e0ea015ad40 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 24 Jul 2024 14:36:00 +0100 Subject: [PATCH 006/154] Build Llama SYCL Intel with static libs (#8668) Ensure SYCL CI builds both static & dynamic libs for testing purposes Signed-off-by: Joe Todd --- .devops/llama-cli-intel.Dockerfile | 4 +++- .devops/llama-server-intel.Dockerfile | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index 2bf82bb58..79dba06a7 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + echo "Building with static libs" && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ + ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release --target llama-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index eb9aba618..f525658dd 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ + echo "Building with dynamic libs" && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server From 68504f0970db5a3602d176953690f503059906b1 Mon Sep 17 00:00:00 2001 From: MorganRO8 <47795945+MorganRO8@users.noreply.github.com> Date: Wed, 24 Jul 2024 12:48:00 -0400 Subject: [PATCH 007/154] readme : update games list (#8673) Added link to game I made that depends on llama --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index b7b9bf588..d0ae2efb9 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +**Games:** +- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. + ## Demo
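The patch that follows switches phi3's attention onto a sliding-window KQ mask. Stated as math (this summary is inferred from the mask code, not spelled out in the patch): with window size $n_{\text{swa}}$ read from the `sliding_window` hparam, a query at position $i$ may attend to a cached position $j$ only if

$$ 0 \le i - j < n_{\text{swa}}, $$

and every other mask entry is set to $-\infty$; the plain causal mask (`inp_KQ_mask`) keeps just the $j \le i$ half of that condition.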
From 8a4bad50a8ed24ed1e9df003521468dcc37320e8 Mon Sep 17 00:00:00 2001
From: Fan Shupei
Date: Thu, 25 Jul 2024 15:21:09 +0800
Subject: [PATCH 008/154] llama: use sliding window for phi3 (#8627)

* use sliding window for phi3

* fix typo, "data_swa" -> "data"

* [conver_hf_to_gguf.py] add phi3 sliding window
---
 convert_hf_to_gguf.py |  1 +
 src/llama.cpp         | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index dde4fa9c8..4087187c1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2084,6 +2084,7 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
diff --git a/src/llama.cpp b/src/llama.cpp
index 04eaf6730..9e502018d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4889,6 +4889,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
@@ -10748,7 +10749,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();

         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -10806,7 +10807,7 @@
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -14013,18 +14014,23 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                 "causal attention is not supported by this model"
             );

-    if (lctx.inp_KQ_mask) {
+    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
         // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
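        // Two mask buffers can now be live at once: `data` backs the dense causal
        // mask and `data_swa` the sliding-window variant; a model's graph may read
        // either or both, so each pointer is null-checked below before the
        // corresponding mask values are written.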
        if (cparams.causal_attn && !lctx.is_encoding) {
            const int64_t n_kv     = kv_self.n;
            const int64_t n_tokens = batch.n_tokens;

-           GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
-           float * data = (float *) lctx.inp_KQ_mask->data;
+           float * data = nullptr;
            float * data_swa = nullptr;

+           if (lctx.inp_KQ_mask) {
+               GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+               data = (float *) lctx.inp_KQ_mask->data;
+           }

+           if (lctx.inp_KQ_mask_swa) {
+               GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                data_swa = (float *) lctx.inp_KQ_mask_swa->data;
+           }
@@ -14047,7 +14053,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                            f = 0.0f;
                        }
                    }
-                   data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+
+                   if (data) {
+                       data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                   }

                    // may need to cut off old tokens for sliding window
                    if (data_swa) {
@@ -14059,9 +14068,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                    }
                }

-               for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                   for (int j = 0; j < n_kv; ++j) {
-                       data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+               if (data) {
+                   for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                       for (int j = 0; j < n_kv; ++j) {
+                           data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                       }
+                   }
+               }
+
+               if (data_swa) {
+                   for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                       for (int j = 0; j < n_kv; ++j) {
+                           data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                       }
                    }
                }
            }

From 4b0eff3df58d8d86e47348fb73d54da3194d416d Mon Sep 17 00:00:00 2001
From: Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
Date: Thu, 25 Jul 2024 13:43:27 +0530
Subject: [PATCH 009/154] docs : Quantum -> Quantized (#8666)

* docfix: imatrix readme, quantum models -> quantized models.

* docfix: server readme: quantum models -> quantized models.
---
 examples/imatrix/README.md | 2 +-
 examples/server/README.md  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 29602881a..bb5faec94 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/imatrix

-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861

 ## Usage
diff --git a/examples/server/README.md b/examples/server/README.md
index ff4074517..33a2b95cc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:** - * LLM inference of F16 and quantum models on GPU and CPU + * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes * Parallel decoding with multi-user support * Continuous batching From be6d7c079173d941b4f784500f9148f46cec2724 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 25 Jul 2024 10:39:04 +0200 Subject: [PATCH 010/154] examples : remove `finetune` and `train-text-from-scratch` (#8669) * examples : remove finetune and train-text-from-scratch * fix build * update help message * fix small typo for export-lora --- .devops/nix/apps.nix | 1 - .devops/tools.sh | 4 - Makefile | 30 +- examples/CMakeLists.txt | 2 - examples/deprecation-warning/README.md | 2 - examples/export-lora/README.md | 12 +- examples/finetune/CMakeLists.txt | 5 - examples/finetune/README.md | 90 - .../convert_finetune_checkpoint_to_gguf.py | 487 ----- examples/finetune/finetune.cpp | 1862 ----------------- examples/finetune/finetune.sh | 34 - .../train-text-from-scratch/CMakeLists.txt | 5 - examples/train-text-from-scratch/README.md | 27 - .../convert_train_checkpoint_to_gguf.py | 499 ----- .../train-text-from-scratch.cpp | 1253 ----------- 15 files changed, 14 insertions(+), 4299 deletions(-) delete mode 100644 examples/finetune/CMakeLists.txt delete mode 100644 examples/finetune/README.md delete mode 100644 examples/finetune/convert_finetune_checkpoint_to_gguf.py delete mode 100644 examples/finetune/finetune.cpp delete mode 100644 examples/finetune/finetune.sh delete mode 100644 examples/train-text-from-scratch/CMakeLists.txt delete mode 100644 examples/train-text-from-scratch/README.md delete mode 100644 examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py delete mode 100644 examples/train-text-from-scratch/train-text-from-scratch.cpp diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 897fce4d3..0ecf19fc5 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -10,7 +10,6 @@ "llama-embedding" "llama-server" "llama-quantize" - "llama-train-text-from-scratch" ]; mkApp = name: { type = "app"; diff --git a/.devops/tools.sh b/.devops/tools.sh index cf0e8f32d..24dcfd350 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./llama-cli "$@" -elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then - ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -36,8 +34,6 @@ else echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --finetune (-f): Run finetune command to create a lora finetune of the model" - echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" diff --git a/Makefile b/Makefile index 58a93db1a..8d2ccddc4 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ BUILD_TARGETS = \ llama-embedding \ llama-eval-callback \ llama-export-lora \ - llama-finetune \ llama-gbnf-validator \ llama-gguf \ llama-gguf-hash \ @@ -37,7 +36,6 @@ BUILD_TARGETS = \ llama-simple \ llama-speculative \ llama-tokenize \ - llama-train-text-from-scratch \ llama-vdot \ llama-cvector-generator \ tests/test-c.o @@ -64,13 +62,13 @@ TEST_TARGETS = \ tests/test-tokenizer-1-spm # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. 
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune +LEGACY_TARGETS_BUILD = main quantize perplexity embedding server # Deprecation aliases ifdef LLAMA_CUBLAS @@ -1296,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ $(OBJ_GGML) $(OBJ_LLAMA) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1316,11 +1309,6 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-finetune: examples/finetune/finetune.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-export-lora: examples/export-lora/export-lora.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1578,7 +1566,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # # Mark legacy binary targets as .PHONY so that they are always checked. -.PHONY: main quantize perplexity embedding server finetune +.PHONY: main quantize perplexity embedding server # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. @@ -1621,13 +1609,3 @@ ifneq (,$(wildcard embedding)) @echo " Remove the 'embedding' binary to remove this warning." @echo "#########" endif - -finetune: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard finetune)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead." - @echo " Remove the 'finetune' binary to remove this warning." - @echo "#########" -endif diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 155743639..67b3d2774 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) add_subdirectory(export-lora) - add_subdirectory(finetune) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -53,5 +52,4 @@ else() add_subdirectory(simple) add_subdirectory(speculative) add_subdirectory(tokenize) - add_subdirectory(train-text-from-scratch) endif() diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 1e20feb4a..59918ec2b 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names. 
| server | llama-server | | llama-bench | llama-bench | | embedding | llama-embedding | -| finetune | llama-finetune | | quantize | llama-quantize | | tokenize | llama-tokenize | | export-lora | llama-export-lora | @@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names. | save-load-state | llama-save-load-state | | simple | llama-simple | | speculative | llama-speculative | -| train-text-from-scratch | llama-train-text-from-scratch | | vdot | llama-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 6d51f4b24..91c33c34a 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -19,7 +19,15 @@ For example: ./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin + --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf ``` -Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters. +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: + +```bash +./bin/llama-export-lora \ + -m your_base_model.gguf \ + -o your_merged_model.gguf \ + --lora-scaled lora_task_A.gguf 0.5 \ + --lora-scaled lora_task_B.gguf 0.5 +``` diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt deleted file mode 100644 index 64afe6ddc..000000000 --- a/examples/finetune/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-finetune) -add_executable(${TARGET} finetune.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md deleted file mode 100644 index 1c27df053..000000000 --- a/examples/finetune/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# finetune - -Basic usage instructions: - -```bash -# get training data -wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt - -# finetune LORA adapter -./bin/llama-finetune \ - --model-base open-llama-3b-v2-q8_0.gguf \ - --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ - --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ - --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ - --train-data "shakespeare.txt" \ - --save-every 10 \ - --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing - -# predict -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin -``` - -**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). -The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. 
-So in the above example, after 10 iterations these files will be written:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-After 10 more iterations:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training state. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
-
-llama.cpp compatible LORA adapters will be saved with the filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
-
-In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
-
-For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
-  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
-```
-
-You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
-
-For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter, and 100% of yet another one:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
-  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
-```
-
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
-
-Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
-If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
-
-The default LORA rank can be specified with `--lora-r N`.
-The LORA rank can be configured for each model tensor type separately with these command line options:
-
-```bash
-  --lora-r N           LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
-  --rank-att-norm N    LORA rank for attention norm tensor (default 1)
-  --rank-ffn-norm N    LORA rank for feed-forward norm tensor (default 1)
-  --rank-out-norm N    LORA rank for output norm tensor (default 1)
-  --rank-tok-embd N    LORA rank for token embeddings tensor (default 4)
-  --rank-out N         LORA rank for output tensor (default 4)
-  --rank-wq N          LORA rank for wq tensor (default 4)
-  --rank-wk N          LORA rank for wk tensor (default 4)
-  --rank-wv N          LORA rank for wv tensor (default 4)
-  --rank-wo N          LORA rank for wo tensor (default 4)
-  --rank-ffn_gate N    LORA rank for ffn_gate tensor (default 4)
-  --rank-ffn_down N    LORA rank for ffn_down tensor (default 4)
-  --rank-ffn_up N      LORA rank for ffn_up tensor (default 4)
-```
-
-The LORA rank of 'norm' tensors should always be 1.
-
-To see all available options, use `llama-finetune --help`.
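The merge that `llama-export-lora` performs, and that `llama-cli` applies on the fly, follows directly from the scaling rule described in the README above: each adapted weight matrix becomes `W + scale * (alpha/r) * B*A`. Below is a minimal f32 sketch of that arithmetic; the function name `merge_lora_f32` and the row-major layout are illustrative assumptions, not the actual llama.cpp implementation:

```cpp
// Illustrative sketch only: dense f32 LoRA merge, W += scale * (alpha/r) * B*A.
// W is n_out x n_in, A is r x n_in, B is n_out x r, all row-major (assumed layout).
#include <cstddef>
#include <vector>

static void merge_lora_f32(std::vector<float> & W, const std::vector<float> & A,
                           const std::vector<float> & B, size_t n_out, size_t n_in,
                           size_t r, float alpha, float scale) {
    const float s = scale * alpha / (float) r; // user scale times the alpha/r factor
    for (size_t i = 0; i < n_out; ++i) {
        for (size_t j = 0; j < n_in; ++j) {
            float acc = 0.0f;
            for (size_t k = 0; k < r; ++k) {
                acc += B[i*r + k] * A[k*n_in + j]; // (B*A)[i][j]
            }
            W[i*n_in + j] += s * acc;
        }
    }
}
```

Mixing several adapters is just this update applied repeatedly with a different `scale` per adapter, which is why the scales need not sum to one.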
diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py deleted file mode 100644 index 1b79d6995..000000000 --- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env python3 -# finetune checkpoint --> gguf conversion - -import argparse -import gguf -import struct -import numpy as np -from pathlib import Path - -# gguf constants -LLM_KV_OPTIMIZER_TYPE = "optimizer.type" -LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" -LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" -LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" -LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" -LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" -LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" -LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" -LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" -LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" -LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" -LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" -LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" -LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" - -LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" -LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" -LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" - -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" -LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" - -LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" -LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" -LLM_KV_TRAINING_TYPE = "training.type" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" - -LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" -LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" -LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" -LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" -LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" -LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" -LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" -LLM_KV_TRAINING_LORA_RANK_ATTN_OUT 
= "training.lora.rank.attn_output" -LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" -LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" -LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" -LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" - -class Tensor: - def __init__(self, dtype='f', ne=None): - if ne is None: - ne = [] - self.dtype = dtype - self.ne = ne - self.nbytes = 0 - if self.dtype == 'f': - if len(self.ne) == 0: - self.nbytes = 0 - else: - self.nbytes = int(np.prod(self.ne)) * 4 - else: - raise ValueError(f"Unhandled data type '{self.dtype}'") - - def load(self, data, offset): - nd = struct.unpack(' 0 else []) - - self.lbfgs_x = Tensor('f', [self.nx]) - self.lbfgs_xp = Tensor('f', [self.nx]) - self.lbfgs_g = Tensor('f', [self.nx]) - self.lbfgs_gp = Tensor('f', [self.nx]) - self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) - self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) - self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) - - # forgot to save type in version 1: - # guess self.type from number of remaining bytes - size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_v] - +([self.adam_pf] if (self.past > 0) else [])]) - size_type_1 = 24 + sum([t.max_storage_size() for t in - [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, - self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, - self.lbfgs_lmal, self.lbfgs_lmys, - self.lbfgs_lms, self.lbfgs_lmy] - +([self.lbfgs_pf] if (self.past > 0) else [])]) - # due to alignment padding the size might not by exact - # but the difference in size for both types is significant, - # so we can just use whichever is closest - remaining = len(data) - offset - if abs(remaining - size_type_0) < abs(remaining - size_type_1): - self.type = 0 - else: - self.type = 1 - - if self.type == 0: - offset = self.adam_m.load(data, offset) - offset = self.adam_v.load(data, offset) - offset = self.adam_pf.load(data,offset) - - self.adam_fx_best = struct.unpack(' 0: - self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) - - elif self.type == 1: - gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) - - self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) - self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) - self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) - self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) - self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) - if self.past > 0: - self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) - self.lbfgs_lmal.save_gguf(gguf_writer, 
name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) - self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) - self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) - self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) - else: - raise ValueError('Unknown optimizer type') - -class LoraParams: - def __init__(self): - pass - - def load(self, data, offset): - self.n_rank_attention_norm = struct.unpack(' -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - - // float f_norm_eps = 1e-5f; // falcon - float f_norm_rms_eps = 1e-5f; // llama - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 -}; - -struct my_llama_model { - struct my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; -}; - -struct my_llama_lora_hparams { - uint32_t lora_r = 1; - uint32_t lora_alpha = 1; - uint32_t n_rank_attention_norm = 1; - uint32_t n_rank_wq = 4; - uint32_t n_rank_wk = 4; - uint32_t n_rank_wv = 4; - uint32_t n_rank_wo = 4; - uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_ffn_gate = 4; - uint32_t n_rank_ffn_down = 4; - uint32_t n_rank_ffn_up = 4; - uint32_t n_rank_tok_embeddings = 4; - uint32_t n_rank_norm = 1; - uint32_t n_rank_output = 4; - - bool operator!=(const my_llama_lora_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_lora_layer { - // normalization - struct ggml_tensor * attention_norm_a; - struct ggml_tensor * attention_norm_b; - - // attention - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wk_a; - struct ggml_tensor * wk_b; - struct ggml_tensor * wv_a; - struct ggml_tensor * wv_b; - struct ggml_tensor * wo_a; - struct ggml_tensor * wo_b; - - // normalization - struct ggml_tensor * ffn_norm_a; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate_a; - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_a; - struct ggml_tensor * ffn_down_b; - struct ggml_tensor * ffn_up_a; - struct ggml_tensor * ffn_up_b; -}; - -struct my_llama_lora { - struct ggml_context * ctx = NULL; - ggml_backend_buffer_t data; - - my_llama_lora_hparams hparams; - - struct ggml_tensor * tok_embeddings_a; - struct ggml_tensor * tok_embeddings_b; - - struct ggml_tensor * norm_a; - struct ggml_tensor * norm_b; - struct ggml_tensor * output_a; - struct ggml_tensor * output_b; - - std::vector layers; -}; - -// gguf constants -static const char * 
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; - -static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; - -// gguf constants (sync with gguf.py) - -static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv"; -static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -static const char * LLM_TENSOR_OUTPUT = "output"; -static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab : %u\n", __func__, params->n_vocab); - printf("%s: n_ctx : %u\n", __func__, params->n_ctx); - printf("%s: n_embd : %u\n", __func__, params->n_embd); - printf("%s: n_ff : %u\n", __func__, params->n_ff); - printf("%s: n_head : %u\n", __func__, params->n_head); - printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); - printf("%s: n_layer : %u\n", __func__, params->n_layer); - printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); - printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); - printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); -} - 
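-// Illustrative aside with hypothetical numbers (not from the original finetune.cpp):
-// the my_llama_hparams helpers above derive the grouped-query-attention sizes.
-// With the default n_head_kv == n_head there is no GQA and n_embd_gqa() == n_embd;
-// a GQA configuration shows why init_model() below asserts wk/wv with
-// n_embd_gqa() columns instead of n_embd:
-//
-//   n_embd = 4096, n_head = 32, n_head_kv = 8
-//   n_gqa()       = n_head / n_head_kv = 4
-//   n_embd_head() = n_embd / n_head    = 128
-//   n_embd_gqa()  = n_embd / n_gqa()   = 1024   (= n_embd_head() * n_head_kv)
-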
-static void print_lora_params(struct my_llama_lora_hparams * params) { - printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); - printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); - printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); - printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); - printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); - printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); - printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); - printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); - printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); - printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); - printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); -} - -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} - -static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { - std::string arch; - - GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - if (expected_arch != NULL) { - if (arch != expected_arch) { - printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); - } - GGML_ASSERT(arch == expected_arch); - } - - std::vector keybuf; - keybuf.resize(512); - auto kv = [&arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); - return keybuf.data(); - }; - - GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - - // n_head_kv is optional, default to n_head - hparams->n_head_kv = hparams->n_head; - GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - hparams->rope_freq_scale = 1.0f / rope_freq_scale; - } -} - -static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { - auto & hparams = model->hparams; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key) -> const char * { - 
snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); - return tn_buf.data(); - }; - - - // get parameters directly from gguf file - { - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; - struct gguf_context * mctx = gguf_init_from_file(fn_model, params); - - load_model_hparams_gguf(mctx, &hparams, "llama"); - - gguf_free(mctx); - } - hparams.n_vocab = llama_n_vocab(input); - hparams.n_ctx = n_ctx; - - // get tensors from llama_model (possibly mmapped) - model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); - model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); - model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); - - assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); - assert_shape_1d(model->norm, hparams.n_embd); - assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); - - model->layers.resize(hparams.n_layer); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); - layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); - layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); - layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); - layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); - layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); - - assert_shape_1d(layer.attention_norm, hparams.n_embd); - assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); - assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); - assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); - } -} - -static void set_param_lora(struct my_llama_lora * lora) { - const uint32_t n_layer = lora->layers.size(); - - struct ggml_context* ctx = lora->ctx; - - ggml_set_param(ctx, lora->tok_embeddings_a); - ggml_set_param(ctx, lora->tok_embeddings_b); - ggml_set_param(ctx, lora->norm_a); - ggml_set_param(ctx, lora->norm_b); - ggml_set_param(ctx, lora->output_a); - ggml_set_param(ctx, lora->output_b); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - ggml_set_param(ctx, layer.attention_norm_a); - ggml_set_param(ctx, layer.attention_norm_b); - ggml_set_param(ctx, layer.wq_a); - ggml_set_param(ctx, layer.wq_b); - ggml_set_param(ctx, layer.wk_a); - ggml_set_param(ctx, layer.wk_b); - ggml_set_param(ctx, layer.wv_a); - ggml_set_param(ctx, layer.wv_b); - ggml_set_param(ctx, layer.wo_a); - ggml_set_param(ctx, layer.wo_b); - ggml_set_param(ctx, layer.ffn_norm_a); - ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.ffn_gate_a); - ggml_set_param(ctx, 
layer.ffn_gate_b); - ggml_set_param(ctx, layer.ffn_down_a); - ggml_set_param(ctx, layer.ffn_down_b); - ggml_set_param(ctx, layer.ffn_up_a); - ggml_set_param(ctx, layer.ffn_up_b); - } -} - -static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { - const auto & lparams = lora->hparams; - - const uint32_t n_embd = model->hparams.n_embd; - const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); - const uint32_t n_layer = model->hparams.n_layer; - const uint32_t n_vocab = model->hparams.n_vocab; - const uint32_t n_ff = model->hparams.n_ff; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // context for lora tensors without their data - struct ggml_init_params ctx_lora_params; - ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); - ctx_lora_params.mem_buffer = NULL; - ctx_lora_params.no_alloc = true; - - struct ggml_context * ctx = ggml_init(ctx_lora_params); - lora->ctx = ctx; - - lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); - lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); - lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); - lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); - lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); - lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); - - ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); - ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); - ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); - ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); - ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); - ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); - - lora->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); - layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1); - - layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); - layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); - layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); - layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); - layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - - layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); - layer.ffn_norm_b = 
ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - - layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); - layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); - layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); - layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); - layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); - layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); - - ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); - ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); - ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); - ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); - ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); - ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); - ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); - ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); - } - - set_param_lora(lora); - - // allocate data for lora tensors - lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); -} - -static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { - const uint32_t n_layer = lora->layers.size(); - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(lora->tok_embeddings_a, rnd); - ggml_set_zero(lora->tok_embeddings_b); - randomize_tensor_normal(lora->norm_a, rnd); - ggml_set_zero(lora->norm_b); - randomize_tensor_normal(lora->output_a, rnd); - ggml_set_zero(lora->output_b); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - randomize_tensor_normal(layer.attention_norm_a, rnd); - ggml_set_zero(layer.attention_norm_b); - - randomize_tensor_normal(layer.wq_a, rnd); - ggml_set_zero(layer.wq_b); - randomize_tensor_normal(layer.wk_a, rnd); - ggml_set_zero(layer.wk_b); - randomize_tensor_normal(layer.wv_a, rnd); - ggml_set_zero(layer.wv_b); - randomize_tensor_normal(layer.wo_a, rnd); - ggml_set_zero(layer.wo_b); - - randomize_tensor_normal(layer.ffn_norm_a, rnd); - ggml_set_zero(layer.ffn_norm_b); - - randomize_tensor_normal(layer.ffn_gate_a, rnd); - ggml_set_zero(layer.ffn_gate_b); - randomize_tensor_normal(layer.ffn_down_a, rnd); - ggml_set_zero(layer.ffn_down_b); - randomize_tensor_normal(layer.ffn_up_a, rnd); - ggml_set_zero(layer.ffn_up_b); - } - - 
free_random_normal_distribution(rnd); -} - -static struct ggml_tensor * llama_build_lora_finetune_graphs( - struct my_llama_model * model, - struct my_llama_lora * lora, - ggml_gallocr_t alloc, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - const int n_tokens, - const int n_batch, - const bool enable_flash_attn, - const bool enable_checkpointing, - const bool measure_only) { - - ggml_set_scratch(ctx, { 0, 0, nullptr, }); - const int n_past = 0; - const int N = n_tokens; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_ff = hparams.n_ff; - const int n_rot = hparams.n_embd_head(); - const int n_embd_head = hparams.n_embd_head(); - const int n_embd_gqa = hparams.n_embd_gqa(); - - const float rms_norm_eps = hparams.f_norm_rms_eps; - const float rope_freq_base = hparams.rope_freq_base; - const float rope_freq_scale = hparams.rope_freq_scale; - - GGML_ASSERT((size_t) n_layer == lora->layers.size()); - - auto set_name = [](struct ggml_tensor * t, const char * n) { - ggml_set_name(t, n); - if (t->grad) { - ggml_format_name(t->grad, "%s->grad", n); - } - }; - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_set_input(KQ_pos); - - // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] - (struct ggml_tensor * t) -> struct ggml_tensor * { - // not capturing these, to silcence warnings - const int rope_mode = 0; - - return ggml_rope_ext(ctx, - t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f - ); - }; - - set_name(tokens_input, "tokens_input"); - set_name(targets, "targets"); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - - auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) { - return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); - } else if (a->type == GGML_TYPE_F32) { - return ggml_add(ctx, a, b); - } else { - die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n", - __func__, ggml_type_name(a->type)); - } - }; - - struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b)); - struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); - struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); - - struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); - struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); - - struct ggml_tensor * cur = t01; - - std::vector checkpoints; - if (enable_checkpointing) { - checkpoints.push_back(tokens_input); - checkpoints.push_back(targets); - checkpoints.push_back(t00); - checkpoints.push_back(t01); - } - - const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); - - for 
(int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - struct my_llama_lora_layer & llayer = lora->layers[il]; - - struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); - struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); - struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); - struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); - - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); - struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); - - struct ggml_tensor * t11; - if (ggml_is_quantized(wv->type)) { - struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); - struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa); - t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } else { - t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } - - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); - 
struct ggml_tensor * t16; - if (enable_flash_attn) { - GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); - //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); - cur = t30; - if (enable_checkpointing) { - checkpoints.push_back(cur); - } - } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); - - if 
(enable_checkpointing) { - checkpoints.push_back(t31); - checkpoints.push_back(t32); - checkpoints.push_back(t33); - checkpoints.push_back(t34); - checkpoints.push_back(t35); - checkpoints.push_back(t36); - } - - ggml_build_forward_expand(gf, t36); - - if (enable_checkpointing) { - ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); - } else { - ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, true); - } - - GGML_ASSERT(alloc != NULL); - - // make sure some tensors are not reallocated by inserting new temporary nodes depending on them - int n_leafs_before = gb->n_leafs; - int n_nodes_before = gb->n_nodes; - - // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); - // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); - GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_set_input(t36->grad); - // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); - - // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); - } - - // allocating checkpoints in one block to reduce memory fragmentation - // note: they will be freed in reverse order - for (unsigned int i = 0; i < checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_set_input(checkpoints[i]); - } - } - - if (measure_only) { - ggml_gallocr_reserve(alloc, gb); - } else { - ggml_gallocr_alloc_graph(alloc, gb); - - // set KQ_pos - { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - } - - // remove the additional nodes and leafs - for (int i = n_leafs_before; i < gb->n_leafs; ++i) { - gb->leafs[i] = NULL; - } - for (int i = n_nodes_before; i < gb->n_nodes; ++i) { - gb->nodes[i] = NULL; - } - gb->n_leafs = n_leafs_before; - gb->n_nodes = n_nodes_before; - - *logits = t35; - return t36; -} - -static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - std::string arch; - - std::vector keybuf; - keybuf.resize(512); - - GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - GGML_ASSERT(arch 
== "llama"); - - uint32_t ftype_u; - GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); - GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - - struct my_llama_hparams hparams; - load_model_hparams_gguf(fctx, &hparams, arch.c_str()); - - // parameters that define tensor shapes must match - GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); - GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); - GGML_ASSERT(hparams.n_head == model->hparams.n_head); - GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); - GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); - - GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); - - init_lora(model, lora); - - copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); - copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); - copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); - copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); - copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); - copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); - copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); - copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); - copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); - copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); - copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); - copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); - copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); - copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); - 
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); - copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); - copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); - copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); - copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); - copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); - copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); - copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); - } -} - -static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { - const char * arch = "llama"; - enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::vector keybuf; - keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch); - return keybuf.data(); - }; - - gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); - gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); - gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); - gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); - gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); - gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); - - gguf_add_tensor(fctx, lora->tok_embeddings_a); - gguf_add_tensor(fctx, lora->tok_embeddings_b); - gguf_add_tensor(fctx, lora->norm_a); - gguf_add_tensor(fctx, lora->norm_b); - gguf_add_tensor(fctx, lora->output_a); - gguf_add_tensor(fctx, lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = 
lora->layers[i]; - - gguf_add_tensor(fctx, layer.attention_norm_a); - gguf_add_tensor(fctx, layer.attention_norm_b); - gguf_add_tensor(fctx, layer.wq_a); - gguf_add_tensor(fctx, layer.wq_b); - gguf_add_tensor(fctx, layer.wk_a); - gguf_add_tensor(fctx, layer.wk_b); - gguf_add_tensor(fctx, layer.wv_a); - gguf_add_tensor(fctx, layer.wv_b); - gguf_add_tensor(fctx, layer.wo_a); - gguf_add_tensor(fctx, layer.wo_b); - gguf_add_tensor(fctx, layer.ffn_norm_a); - gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.ffn_gate_a); - gguf_add_tensor(fctx, layer.ffn_gate_b); - gguf_add_tensor(fctx, layer.ffn_down_a); - gguf_add_tensor(fctx, layer.ffn_down_b); - gguf_add_tensor(fctx, layer.ffn_up_a); - gguf_add_tensor(fctx, layer.ffn_up_b); - } -} - -static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - - load_train_state_gguf(fctx, f_ggml_ctx, train); - load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); -} - -static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - save_llama_lora_gguf(fctx, model, lora); - save_train_state_gguf(fctx, train); -} - -static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; - struct gguf_context * fctx = gguf_init_from_file(filename, params); - if (fctx == NULL) { - return false; - } - - load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); - - gguf_free(fctx); - return true; -} - -static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - printf("%s: saving to %s\n", __func__, filename); - struct gguf_context * fctx = gguf_init_empty(); - - save_checkpoint_lora_gguf(fctx, model, lora, train); - - // write file - const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); - gguf_free(fctx); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); - } - if (ret != 1) { - die("unexpectedly reached end of file"); - } - } - 
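-    // Illustrative aside (pad32 is hypothetical, not part of the original file):
-    // write_tensor() further below aligns tensor data to 32 bytes with
-    //   file->seek((0 - file->tell()) & 31, SEEK_CUR);
-    // With unsigned size_t arithmetic, (0 - pos) & 31 == (32 - pos % 32) % 32,
-    // i.e. the number of padding bytes to the next multiple of 32, and 0 when
-    // the offset is already aligned:
-    //   static size_t pad32(size_t pos) { return (0 - pos) & 31; }
-    //   // pad32(1) == 31, pad32(33) == 31, pad32(64) == 0
-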
- std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - if (name == NULL) { - name = ggml_get_name(tensor); - } - uint32_t name_len = strlen(name); - uint32_t nd = ggml_n_dims(tensor); - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { - printf("%s: saving to %s\n", __func__, filename); - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - - auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic - file.write_u32(1); // version - // write_hparams - file.write_u32(lora->hparams.lora_r); - file.write_u32(lora->hparams.lora_alpha); - // write tensors - write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA")); - write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB")); - write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA")); - write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB")); - write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA")); - write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB")); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); - write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); - write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); - write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); - write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); - write_tensor(&file, 
layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); - write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); - write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); - } -} - -struct train_params { - struct train_params_common common; - - const char * fn_model_base; - const char * fn_lora_out; - - bool only_write_lora; - - float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; - - bool custom_f_norm_rms_eps; - bool custom_rope_freq_base; - bool custom_rope_freq_scale; - - int32_t lora_r; - int32_t lora_alpha; - bool custom_lora_alpha; - - uint32_t n_rank_attention_norm; - uint32_t n_rank_wq; - uint32_t n_rank_wk; - uint32_t n_rank_wv; - uint32_t n_rank_wo; - uint32_t n_rank_ffn_norm; - uint32_t n_rank_ffn_gate; - uint32_t n_rank_ffn_down; - uint32_t n_rank_ffn_up; - uint32_t n_rank_tok_embeddings; - uint32_t n_rank_norm; - uint32_t n_rank_output; - - bool custom_n_rank_attention_norm; - bool custom_n_rank_wq; - bool custom_n_rank_wk; - bool custom_n_rank_wv; - bool custom_n_rank_wo; - bool custom_n_rank_ffn_norm; - bool custom_n_rank_ffn_gate; - bool custom_n_rank_ffn_down; - bool custom_n_rank_ffn_up; - bool custom_n_rank_tok_embeddings; - bool custom_n_rank_norm; - bool custom_n_rank_output; -}; - -static struct train_params get_default_train_params() { - struct train_params params; - params.common = get_default_train_params_common(); - params.fn_model_base = ""; - params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; - - params.only_write_lora = false; - - params.f_norm_rms_eps = 1e-5f; - params.rope_freq_base = 10000.0f; - params.rope_freq_scale = 1.0f; - - params.custom_f_norm_rms_eps = false; - params.custom_rope_freq_base = false; - params.custom_rope_freq_scale = false; - - params.lora_r = 4; - params.lora_alpha = 4; - params.custom_lora_alpha = false; - - params.n_rank_attention_norm = 1; - params.n_rank_wq = 4; - params.n_rank_wk = 4; - params.n_rank_wv = 4; - params.n_rank_wo = 4; - params.n_rank_ffn_norm = 1; - params.n_rank_ffn_gate = 4; - params.n_rank_ffn_down = 4; - params.n_rank_ffn_up = 4; - params.n_rank_tok_embeddings = 4; - params.n_rank_norm = 1; - params.n_rank_output = 4; - - params.custom_n_rank_attention_norm = false; - params.custom_n_rank_wq = false; - params.custom_n_rank_wk = false; - params.custom_n_rank_wv = false; - params.custom_n_rank_wo = false; - params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_ffn_gate = false; - params.custom_n_rank_ffn_down = false; - params.custom_n_rank_ffn_up = false; - params.custom_n_rank_tok_embeddings = false; - params.custom_n_rank_norm = false; - params.custom_n_rank_output = false; - - return params; -} - -static void train_print_usage(int argc, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, 
"\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - - fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); - fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); - fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); - fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); - fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); - fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); - fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); - fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); - - print_common_train_usage(argc, argv, ¶ms->common); -} - -static bool train_params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { - if (invalid_param) { - break; - } else if (params->common.print_usage) { - train_print_usage(argc, argv, &default_params); - exit(0); - } - } else if (arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "--lora-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_lora_out = argv[i]; - } else if (arg == "--only-write-lora") { - params->only_write_lora = true; - } else if (arg == "--norm-rms-eps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->f_norm_rms_eps = std::stof(argv[i]); - params->custom_f_norm_rms_eps = 
true;
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-            params->custom_rope_freq_base = true;
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-            params->custom_rope_freq_scale = true;
-        } else if (arg == "--lora-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_alpha = std::stoi(argv[i]);
-            params->custom_lora_alpha = true;
-        } else if (arg == "--lora-r") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_r = std::stoi(argv[i]);
-        } else if (arg == "--rank-att-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_attention_norm = std::stoi(argv[i]);
-            params->custom_n_rank_attention_norm = true;
-        } else if (arg == "--rank-ffn-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_norm = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_norm = true;
-        } else if (arg == "--rank-out-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_norm = std::stoi(argv[i]);
-            params->custom_n_rank_norm = true;
-        } else if (arg == "--rank-tok-embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_tok_embeddings = std::stoi(argv[i]);
-            params->custom_n_rank_tok_embeddings = true;
-        } else if (arg == "--rank-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_output = std::stoi(argv[i]);
-            params->custom_n_rank_output = true;
-        } else if (arg == "--rank-wq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wq = std::stoi(argv[i]);
-            params->custom_n_rank_wq = true;
-        } else if (arg == "--rank-wk") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wk = std::stoi(argv[i]);
-            params->custom_n_rank_wk = true;
-        } else if (arg == "--rank-wv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wv = std::stoi(argv[i]);
-            params->custom_n_rank_wv = true;
-        } else if (arg == "--rank-wo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wo = std::stoi(argv[i]);
-            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-ffn_gate") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_gate = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_gate = true;
-        } else if (arg == "--rank-ffn_down") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_down = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_down = true;
-        } else if (arg == "--rank-ffn_up") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_up = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_up = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-    return true;
-}
-
-struct save_train_files_data {
-    const char * fn_checkpoint_out;
-    const char * fn_lora_out;
-    const char * pattern_fn_it;
-    const char * fn_latest;
-    struct my_llama_model * model;
-    struct my_llama_lora  * lora;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct
save_train_files_data * data = (struct save_train_files_data *) vdata;
-
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train);
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train);
-    }
-    if (strlen(data->fn_lora_out) > 0) {
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora);
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_lora* lora) {
-    int64_t nx = 0;
-    nx += ggml_nelements(lora->tok_embeddings_a);
-    nx += ggml_nelements(lora->tok_embeddings_b);
-    nx += ggml_nelements(lora->norm_a);
-    nx += ggml_nelements(lora->norm_b);
-    nx += ggml_nelements(lora->output_a);
-    nx += ggml_nelements(lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        nx += ggml_nelements(layer.attention_norm_a);
-        nx += ggml_nelements(layer.attention_norm_b);
-        nx += ggml_nelements(layer.wq_a);
-        nx += ggml_nelements(layer.wq_b);
-        nx += ggml_nelements(layer.wk_a);
-        nx += ggml_nelements(layer.wk_b);
-        nx += ggml_nelements(layer.wv_a);
-        nx += ggml_nelements(layer.wv_b);
-        nx += ggml_nelements(layer.wo_a);
-        nx += ggml_nelements(layer.wo_b);
-        nx += ggml_nelements(layer.ffn_norm_a);
-        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.ffn_gate_a);
-        nx += ggml_nelements(layer.ffn_gate_b);
-        nx += ggml_nelements(layer.ffn_down_a);
-        nx += ggml_nelements(layer.ffn_down_b);
-        nx += ggml_nelements(layer.ffn_up_a);
-        nx += ggml_nelements(layer.ffn_up_b);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
-    llama_mparams.vocab_only = false;
-
-    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams);
-
-    struct llama_context_params llama_cparams = llama_context_default_params();
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams);
-
-    struct my_llama_model model;
-    init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx);
-
-    struct my_llama_lora lora;
-
-    struct train_state * train = init_train_state();
-    struct ggml_opt_context * opt = train->opt;
-
-    // set params from command line
-    if (params.custom_f_norm_rms_eps) {
-        model.hparams.f_norm_rms_eps = params.f_norm_rms_eps;
-    }
-    if (params.custom_rope_freq_base) {
-        model.hparams.rope_freq_base = params.rope_freq_base;
-    }
-    if (params.custom_rope_freq_scale) {
-        model.hparams.rope_freq_scale = params.rope_freq_scale;
-    }
-    lora.hparams.lora_r = params.lora_r;
-    lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r;
-    uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ?
params.n_rank_attention_norm : 1; - uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; - uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; - uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; - uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; - uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; - uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; - uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; - uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; - uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; - uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; - lora.hparams.n_rank_attention_norm = n_rank_attention_norm; - lora.hparams.n_rank_wq = n_rank_wq; - lora.hparams.n_rank_wk = n_rank_wk; - lora.hparams.n_rank_wv = n_rank_wv; - lora.hparams.n_rank_wo = n_rank_wo; - lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; - lora.hparams.n_rank_ffn_down = n_rank_ffn_down; - lora.hparams.n_rank_ffn_up = n_rank_ffn_up; - lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; - lora.hparams.n_rank_norm = n_rank_norm; - lora.hparams.n_rank_output = n_rank_output; - - // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; - opt->params.n_threads = params.common.n_threads; - opt->params.past = params.common.opt_past; - opt->params.delta = params.common.opt_delta; - opt->params.max_no_improvement = params.common.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; - opt->params.adam.n_iter = params.common.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.common.adam_alpha; - opt->params.adam.decay = params.common.adam_decay; - opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; - opt->params.adam.beta1 = params.common.adam_beta1; - opt->params.adam.beta2 = params.common.adam_beta2; - opt->params.adam.gclip = params.common.adam_gclip; - opt->params.adam.eps_f = params.common.adam_eps_f; - - printf("%s: init model\n", __func__); - bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); - - if (existed) { - // overwrite last n_ctx with user provided n_ctx - if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; - } - - const bool opt_param_count_changed = ( - (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) - || (lora.hparams.n_rank_wq != n_rank_wq) - || (lora.hparams.n_rank_wk != n_rank_wk) - || (lora.hparams.n_rank_wv != n_rank_wv) - || (lora.hparams.n_rank_wo != n_rank_wo) - || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) - || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) - || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) - || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) - || (lora.hparams.n_rank_norm != n_rank_norm) - || (lora.hparams.n_rank_output != 
n_rank_output) - ); - - const bool opt_past_changed = opt->params.past != params.common.opt_past; - - if (opt_param_count_changed) { - print_lora_params(&lora.hparams); - die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); - // need to discard previous optimizer gradient statistics and opt_init with new shapes - // TODO - } - if (opt_past_changed) { - die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); - // need to discard previous optimizer past function value statistics and opt_init with new shapes - // TODO - } - } else { // existed == false - init_lora(&model, &lora); - randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); - if (!params.only_write_lora) { - ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); - } - } - opt->iter = train->train_its; - - print_params(&model.hparams); - print_lora_params(&lora.hparams); - printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); - printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); - printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); - printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); - - if (params.only_write_lora) { - save_train_files_data save_data; - save_data.fn_checkpoint_out = ""; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - save_train_files(&save_data, train); - - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; - } - - printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - printf("%s: opt iter %d\n", __func__, opt->iter); - - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - - // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_input = ggml_init(ctx_input_params); - - // the input tensors - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - // allocate input tensors - // measure required memory for input tensors - ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); - size_t max_input_size = ggml_backend_buffer_get_size(input_data); - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - - // context for compute tensors without their data - const size_t estimated_compute_size_wo_data = ( - 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + - 
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL, // mem_buffer
-        true, // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_cgraph * gb = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_lora_finetune_graphs(
-            &model, &lora, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_gallocr_free(alloc);
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    // tokenize data
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
-    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
-    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ?
"true" : "false"); - tokenize_file(lctx, - params.common.fn_train_data, - params.common.sample_start, - params.common.include_sample_start, - params.common.overlapping_samples, - n_tokens, - train_tokens, - train_samples_begin, - train_samples_size); - GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); - - printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); - - std::vector token_noccurs; - token_noccurs.resize(model.hparams.n_vocab, 0); - for (unsigned int i = 0; i < train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - } - int n_unique_tokens = 0; - for (unsigned int i = 0; i < token_noccurs.size(); ++i) { - if (token_noccurs[i] == 0) continue; - ++n_unique_tokens; - } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - - size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); - const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); - if (changed_train_data) { - printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); - } - if (params.common.force_reshuffle) { - printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); - } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); - train->shuffle_sample_count = train_samples_size.size(); - train->shuffle_next_sample = 0; - train->shuffle_samples_hash = shuffle_samples_hash; - } - std::vector train_shuffled_samples_offs; - std::vector train_shuffled_samples_begin; - std::vector train_shuffled_samples_size; - train_shuffled_samples_offs.resize(train_samples_begin.size()); - train_shuffled_samples_begin.resize(train_samples_begin.size()); - train_shuffled_samples_size.resize(train_samples_size.size()); - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - train_shuffled_samples_offs.data(), - train_shuffled_samples_begin.data(), - train_shuffled_samples_size.data(), - train_samples_begin.data(), - train_samples_size.data(), - train_samples_size.size()); - - printf("%s: begin training\n", __func__); - - save_train_files_data save_data; - save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - struct train_opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms.common; - opt_cb_data.train = train; - opt_cb_data.save_cb = &save_train_files; - opt_cb_data.save_data = &save_data; - opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_begin = train_samples_begin.data(); - opt_cb_data.samples_size = train_samples_size.data(); - opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); - opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); - opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); - opt_cb_data.samples_count = train_samples_size.size(); - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_probs = target_probs; - 
opt_cb_data.first_iter = opt->iter; - opt_cb_data.first_epoch = train->train_epochs; - opt_cb_data.iter_at_last_epoch = -1; - opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.millis_per_iter = 0.0; - - // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; - printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); - - // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx_work = ggml_init(ctx_work_params); - - int64_t t0 = ggml_time_ms(); - - ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - - ggml_free(ctx_work); - ggml_free(ctx_compute); - ggml_free(ctx_input); - ggml_gallocr_free(alloc); - - - int64_t t1 = ggml_time_ms(); - printf("%s: total training time: ", __func__); - print_duration((double) (t1 - t0)); - printf("\n"); - - int new_iters = opt->iter - opt_cb_data.last_save_iter; - if (new_iters > 0) { - train->train_its += new_iters; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - - save_train_files(&save_data, train); - opt_cb_data.last_save_iter = opt->iter; - } - - ggml_free(opt->ctx); - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; -} diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh deleted file mode 100644 index e3cc7f271..000000000 --- a/examples/finetune/finetune.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -cd `dirname $0` -cd ../.. - -EXE="./llama-finetune" - -if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi -if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi - -# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. -MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing. 
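(One aside before the remainder of the deleted script: in the deleted main() above, the work buffer is sized from `ggml_graph_plan(gb, n_threads).work_size`. The usual pattern around that API, sketched under the assumption of this generation's `ggml_cplan` layout, with an illustrative helper name:)

```cpp
// Sketch only: size a scratch buffer from the plan, attach it, then compute.
#include "ggml.h"

#include <cstdint>
#include <vector>

static void compute_graph_with_plan(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
    std::vector<uint8_t> work(plan.work_size); // per-thread scratch memory
    plan.work_data = work.data();
    ggml_graph_compute(graph, &plan);
}
```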
- -while getopts "dg" opt; do - case $opt in - d) - DEBUGGER="gdb --args" - ;; - g) - EXE="./build/bin/Release/finetune" - GPUARG="--gpu-layers 25" - ;; - esac -done - -$DEBUGGER $EXE \ - --model-base $MODEL \ - $GPUARG \ - --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ - --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ - --lora-out lora-ol3b-shakespeare-ITERATION.bin \ - --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ - --save-every 10 \ - --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt deleted file mode 100644 index 9a1d2a35e..000000000 --- a/examples/train-text-from-scratch/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-train-text-from-scratch) -add_executable(${TARGET} train-text-from-scratch.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md deleted file mode 100644 index 3abae2380..000000000 --- a/examples/train-text-from-scratch/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# train-text-from-scratch - -Basic usage instructions: - -```bash -# get training data -wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt - -# train -./bin/llama-train-text-from-scratch \ - --vocab-model ../models/ggml-vocab-llama.gguf \ - --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \ - --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \ - --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \ - --train-data "shakespeare.txt" \ - -t 6 -b 16 --seed 1 --adam-iter 256 \ - --no-checkpointing - -# predict -./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf -``` - -Output files will be saved every N iterations (config with `--save-every N`). -The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output. - -To train GGUF models just pass them to `--checkpoint-in FN`. diff --git a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py deleted file mode 100644 index e045beb72..000000000 --- a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python3 -# train-text-from-scratch checkpoint --> gguf conversion - -import argparse -import os -import struct -import sys -import numpy as np -from pathlib import Path - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / '..' / '..' 
/ 'gguf-py')) -import gguf - -# gguf constants -LLM_KV_OPTIMIZER_TYPE = "optimizer.type" -LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" -LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" -LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" -LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" -LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" -LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" -LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" -LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" -LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" -LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" -LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" -LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" -LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" - -LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" -LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" -LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" - -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" -LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" - -LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" -LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" -LLM_KV_TRAINING_TYPE = "training.type" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" - -class Tensor: - def __init__(self, dtype='f', ne=None): - if ne is None: - ne = [] - self.dtype = dtype - self.ne = ne - self.nbytes = 0 - if self.dtype == 'f': - if len(self.ne) == 0: - self.nbytes = 0 - else: - self.nbytes = int(np.prod(self.ne)) * 4 - else: - raise ValueError(f"Unhandled data type '{self.dtype}'") - - def load(self, data, offset): - nd = struct.unpack(' 0 else []) - - self.lbfgs_x = Tensor('f', [self.nx]) - self.lbfgs_xp = Tensor('f', [self.nx]) - self.lbfgs_g = Tensor('f', [self.nx]) - self.lbfgs_gp = Tensor('f', [self.nx]) - self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) - self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) - self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) - - if self.type == 0: - # these tensors are 
stored, but we don't need their data - x = Tensor('f', [self.nx]) - g = Tensor('f', [self.nx]) - g2 = Tensor('f', [self.nx]) - mh = Tensor('f', [self.nx]) - vh = Tensor('f', [self.nx]) - - offset = x.load(data, offset) - offset = g.load(data, offset) - offset = g2.load(data, offset) - offset = self.adam_m.load(data, offset) - offset = self.adam_v.load(data, offset) - offset = mh.load(data, offset) - offset = vh.load(data, offset) - offset = self.adam_pf.load(data, offset) - - self.adam_fx_best = struct.unpack(' 0 else []) - - self.lbfgs_x = Tensor('f', [self.nx]) - self.lbfgs_xp = Tensor('f', [self.nx]) - self.lbfgs_g = Tensor('f', [self.nx]) - self.lbfgs_gp = Tensor('f', [self.nx]) - self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) - self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) - self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) - - # forgot to save type in version 1: - # guess self.type from number of remaining bytes - size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_v] - +([self.adam_pf] if (self.past > 0) else [])]) - size_type_1 = 24 + sum([t.max_storage_size() for t in - [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, - self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, - self.lbfgs_lmal, self.lbfgs_lmys, - self.lbfgs_lms, self.lbfgs_lmy] - +([self.lbfgs_pf] if (self.past > 0) else [])]) - # due to alignment padding the size might not by exact - # but the difference in size for both types is significant, - # so we can just use whichever is closest - remaining = len(data) - offset - if abs(remaining - size_type_0) < abs(remaining - size_type_1): - self.type = 0 - else: - self.type = 1 - - if self.type == 0: - offset = self.adam_m.load(data, offset) - offset = self.adam_v.load(data, offset) - offset = self.adam_pf.load(data,offset) - - self.adam_fx_best = struct.unpack(' 0: - self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) - - elif self.type == 1: - gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) - - self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) - self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) - self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) - self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) - self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) - if self.past > 0: - self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) - self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) - self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) - self.lbfgs_lms.save_gguf(gguf_writer, 
name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) - self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) - else: - raise ValueError('Unknown optimizer type') - -class ModelParams: - def __init__(self): - pass - - def load(self, data, offset): - self.n_vocab = struct.unpack(' -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; - uint32_t n_embd = 4096; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_ff = 11008; - - // float f_norm_eps = 1e-5f; // falcon - float f_norm_rms_eps = 1e-5f; // llama - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 -}; - -struct my_llama_model { - struct ggml_context * ctx = NULL; - ggml_backend_buffer_t data = NULL; - - my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; -}; - -// gguf constants (sync with gguf.py) -static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; - -static const char * LLM_KV_GENERAL_NAME = "general.name"; -static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -static const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; -static const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; -static const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; -static const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; -static const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; -static const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; -static const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; -static const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; -static const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; -static const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; - -static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -static const char * LLM_TENSOR_OUTPUT = "output"; -static const char * 
LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd: %u\n", __func__, params->n_embd);
-    printf("%s: n_head: %u\n", __func__, params->n_head);
-    printf("%s: n_ff: %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot: %u\n", __func__, params->n_rot);
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.ffn_gate);
-        ggml_set_param(ctx, layer.ffn_down);
-        ggml_set_param(ctx, layer.ffn_up);
-    }
-}
-
-static void init_model(struct my_llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_ff = hparams.n_ff;
-
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    // context for model tensors without their data
-    struct ggml_init_params ctx_model_params;
-    ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_model_params.mem_buffer = NULL;
-    ctx_model_params.no_alloc = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_model_params);
-    model->ctx = ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-
-    ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD));
-    ggml_set_name(model->norm, tn(LLM_TENSOR_OUTPUT_NORM));
-    ggml_set_name(model->output, tn(LLM_TENSOR_OUTPUT));
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - - ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); - - ggml_set_name(layer.wq, tni(LLM_TENSOR_ATTN_Q, i)); - ggml_set_name(layer.wk, tni(LLM_TENSOR_ATTN_K, i)); - ggml_set_name(layer.wv, tni(LLM_TENSOR_ATTN_V, i)); - ggml_set_name(layer.wo, tni(LLM_TENSOR_ATTN_OUT, i)); - - ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - - ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i)); - ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i)); - ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i)); - } - - set_param_model(model); - - // allocate data - model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); -} - -static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { - const auto & hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(model->tok_embeddings, rnd); - randomize_tensor_normal(model->norm, rnd); - randomize_tensor_normal(model->output, rnd); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, rnd); - - randomize_tensor_normal(layer.wq, rnd); - randomize_tensor_normal(layer.wk, rnd); - randomize_tensor_normal(layer.wv, rnd); - randomize_tensor_normal(layer.wo, rnd); - - randomize_tensor_normal(layer.ffn_norm, rnd); - - randomize_tensor_normal(layer.ffn_gate, rnd); - randomize_tensor_normal(layer.ffn_down, rnd); - randomize_tensor_normal(layer.ffn_up, rnd); - } - - free_random_normal_distribution(rnd); -} - -static struct ggml_tensor * llama_build_train_graphs( - struct my_llama_model * model, - ggml_gallocr_t alloc, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - const int n_tokens, - const int n_batch, - const bool enable_flash_attn, - const bool enable_checkpointing, - const bool measure_only) { - - ggml_set_scratch(ctx, { 0, 0, nullptr, }); - const int n_past = 0; - const int N = n_tokens; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = hparams.n_ff; - const float f_norm_rms_eps = hparams.f_norm_rms_eps; - const float rope_freq_base = hparams.rope_freq_base; - const float rope_freq_scale = hparams.rope_freq_scale; - - auto set_name = [](struct ggml_tensor * t, const char * n) { - ggml_set_name(t, n); - if (t->grad) { - ggml_format_name(t->grad, "%s->grad", n); - } - }; - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_set_input(KQ_pos); - - // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] - 
(struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_ext(
-            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets, "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    checkpoints.push_back(tokens_input);
-    checkpoints.push_back(targets);
-    checkpoints.push_back(t00);
-    checkpoints.push_back(t01);
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd);
-        struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct
ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); - cur = t30; - checkpoints.push_back(cur); - } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); - - checkpoints.push_back(t31); - checkpoints.push_back(t32); - checkpoints.push_back(t33); - checkpoints.push_back(t34); - checkpoints.push_back(t35); - checkpoints.push_back(t36); - - ggml_build_forward_expand(gf, t36); - - if (enable_checkpointing) { - ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); - } else { - ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, true); - } - - if (alloc) { - // make sure some tensors are not reallocated by inserting new 
temporary nodes depending on them - int n_leafs_before = gb->n_leafs; - int n_nodes_before = gb->n_nodes; - // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); - // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); - // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); - GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_set_input(t36->grad); - - // allocating checkpoints in one block to reduce memory fragmentation - // note: they will be freed in reverse order - for (int i = 0; i < (int) checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_set_input(checkpoints[i]); - } - } - - //int n_leafs_after = gb->n_leafs; - //int n_nodes_after = gb->n_nodes; - if (measure_only) { - // FIXME: will still allocate - ggml_gallocr_reserve(alloc, gb); - } else { - ggml_gallocr_alloc_graph(alloc, gb); - - if (!measure_only) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - } - - // remove the additional nodes and leafs - for (int i = n_leafs_before; i < gb->n_leafs; ++i) { - gb->leafs[i] = NULL; - } - for (int i = n_nodes_before; i < gb->n_nodes; ++i) { - gb->nodes[i] = NULL; - } - gb->n_leafs = n_leafs_before; - gb->n_nodes = n_nodes_before; - } - - *logits = t35; - return t36; -} - -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -do { \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} while (0) - -static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - std::string arch; - - std::vector keybuf; - keybuf.resize(512); - auto kv = [&arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); - return keybuf.data(); - }; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); - return tn_buf.data(); - }; - - GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - GGML_ASSERT(arch == "llama"); - - uint32_t ftype_u; - GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); - GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - - // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here - GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); - - GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(fctx, 
model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - - model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; - GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - model->hparams.rope_freq_scale = 1.0f / rope_freq_scale; - } - - init_model(model); - - copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); - copy_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); - copy_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); - copy_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); - copy_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); - copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); - copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); - copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); - } -} - -static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { - const char * arch = "llama"; - - enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::vector keybuf; - keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch); - return keybuf.data(); - }; - - // set arch - gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); - gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch); - gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - - // set hparams - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx ); - gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd ); - gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff ); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head ); - gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer ); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot ); - - gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps ); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); - - // set vocab by copying from vocab_model gguf file - { - struct 
gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; - struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); - - const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); - if (token_idx == -1) { - die("cannot find tokenizer vocab in model file"); - } - const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); - - const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); - if (score_idx == -1) { - die("cannot find tokenizer scores in model file"); - } - - const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx); - - const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE)); - if (toktype_idx == -1) { - die("cannot find token type list in GGUF file"); - } - - const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx); - - std::string tokenizer_name; - GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); - - gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str()); - gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab); - gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab); - - int32_t special_bos_id = 1; - int32_t special_eos_id = 2; - int32_t special_unk_id = 0; - int32_t special_sep_id = -1; - int32_t special_pad_id = -1; - if (tokenizer_name == "llama") { - // default special tokens - special_bos_id = 1; - special_eos_id = 2; - special_unk_id = 0; - special_sep_id = -1; - special_pad_id = -1; - } else if (tokenizer_name == "gpt2") { - // read and copy bpe merges - const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES)); - if (merges_keyidx == -1) { - die("cannot find tokenizer merges in model file"); - } - - const int n_merges = gguf_get_arr_n(vctx, merges_keyidx); - - std::vector merges; - merges.resize(n_merges); - for (int i = 0; i < n_merges; i++) { - merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i); - } - gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges); - - // default special tokens - special_bos_id = 11; - special_eos_id = 11; - special_unk_id = -1; - special_sep_id = -1; - special_pad_id = -1; - } else { - fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); - fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__); - } - - std::vector tokens; - tokens.resize(n_vocab); - for (uint32_t i = 0; i < n_vocab; i++) { - tokens[i] = gguf_get_arr_str(vctx, token_idx, i); - } - gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab); - - GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); - GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); - GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); - GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); - GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); - - gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id); - gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id); - gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id); - gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id); - gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), 
special_pad_id); - - gguf_free(vctx); - } - - // add tensors - gguf_add_tensor(fctx, model->tok_embeddings); - gguf_add_tensor(fctx, model->norm); - gguf_add_tensor(fctx, model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - - gguf_add_tensor(fctx, layer.attention_norm); - gguf_add_tensor(fctx, layer.wq); - gguf_add_tensor(fctx, layer.wk); - gguf_add_tensor(fctx, layer.wv); - gguf_add_tensor(fctx, layer.wo); - gguf_add_tensor(fctx, layer.ffn_norm); - gguf_add_tensor(fctx, layer.ffn_gate); - gguf_add_tensor(fctx, layer.ffn_down); - gguf_add_tensor(fctx, layer.ffn_up); - } -} - -static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { - printf("%s: saving to %s\n", __func__, filename); - struct gguf_context * fctx = gguf_init_empty(); - - save_llama_model_gguf(fctx, fn_vocab_model, model); - - // write file - const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); - gguf_free(fctx); -} - -static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) { - load_llama_model_gguf(fctx, f_ggml_ctx, model); - if (load_train_state_gguf(fctx, f_ggml_ctx, train)) { - std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); - } else { - printf("%s: loaded llama model as checkpoint\n", __func__); - } -} - -static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL); - save_llama_model_gguf(fctx, fn_vocab_model, model); - save_train_state_gguf(fctx, train); -} - -static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) { - struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; - struct gguf_context * fctx = gguf_init_from_file(filename, params); - if (fctx == NULL) { - return false; - } - - load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); - - gguf_free(fctx); - return true; -} - -static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { - printf("%s: saving to %s\n", __func__, filename); - struct gguf_context * fctx = gguf_init_empty(); - - save_checkpoint_gguf(fctx, fn_vocab_model, model, train); - - // write file - const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); - gguf_free(fctx); -} - -struct train_params { - struct train_params_common common; - - const char * fn_vocab_model; - const char * fn_model_out; - - bool only_write_model; - - int n_ctx; - int n_embd; - int n_head; - int n_layer; - int n_ff; - - float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; -}; - -static struct train_params get_default_train_params() { - struct train_params params; - params.common = get_default_train_params_common(); - params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; - - params.only_write_model = false; - - params.n_ctx = 128; - params.n_embd = 256; - params.n_head = 8; - params.n_layer = 16; - params.n_ff = 768; - - 
params.f_norm_rms_eps = 1e-5f; - params.rope_freq_base = 10000.0f; - params.rope_freq_scale = 1.0f; - - return params; -} - -static void train_print_usage(int argc, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - - fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); - fprintf(stderr, " --only-write-model only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n"); - fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); - fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); - fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); - fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); - fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - - print_common_train_usage(argc, argv, ¶ms->common); -} - -static bool train_params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { - if (invalid_param) { - break; - } else if (params->common.print_usage) { - train_print_usage(argc, argv, &default_params); - exit(0); - } - } else if (arg == "--vocab-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_vocab_model = argv[i]; - } else if (arg == "--model-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_out = argv[i]; - } else if (arg == "--only-write-model") { - params->only_write_model = true; - } else if (arg == "--embd") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_embd = std::stoi(argv[i]); - } else if (arg == "--ff") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_ff = std::stoi(argv[i]); - } else if (arg == "--head") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_head = std::stoi(argv[i]); - } else if (arg == "--layer") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_layer = std::stoi(argv[i]); - } else if (arg == "--norm-rms-eps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->f_norm_rms_eps = std::stof(argv[i]); - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_scale = std::stof(argv[i]); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - 
train_print_usage(argc, argv, &default_params); - exit(1); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - finish_processing_train_args(¶ms->common); - - return true; -} - -struct save_train_files_data { - const char * fn_checkpoint_out; - const char * fn_model_out; - const char * fn_vocab_model; - const char * pattern_fn_it; - const char * fn_latest; - struct my_llama_model * model; -}; - -static void save_train_files(void * vdata, struct train_state * train) { - struct save_train_files_data * data = (struct save_train_files_data *) vdata; - int64_t iter = train->opt->iter; - - if (strlen(data->fn_checkpoint_out) > 0) { - save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train); - save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model, train); - - } - if (strlen(data->fn_model_out) > 0) { - save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model); - save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model); - } -} - -static int64_t get_parameter_count(struct my_llama_model* model) { - int64_t nx = 0; - nx += ggml_nelements(model->tok_embeddings); - nx += ggml_nelements(model->norm); - nx += ggml_nelements(model->output); - - for (uint32_t i = 0; i < model->layers.size(); ++i) { - auto & layer = model->layers[i]; - nx += ggml_nelements(layer.attention_norm); - nx += ggml_nelements(layer.wq); - nx += ggml_nelements(layer.wk); - nx += ggml_nelements(layer.wv); - nx += ggml_nelements(layer.wo); - nx += ggml_nelements(layer.ffn_norm); - nx += ggml_nelements(layer.ffn_gate); - nx += ggml_nelements(layer.ffn_down); - nx += ggml_nelements(layer.ffn_up); - } - return nx; -} - -int main(int argc, char ** argv) { - struct train_params params = get_default_train_params(); - - if (!train_params_parse(argc, argv, ¶ms)) { - return 1; - } - - if (params.common.seed == LLAMA_DEFAULT_SEED) { - params.common.seed = time(NULL); - } - printf("%s: seed: %u\n", __func__, params.common.seed); - srand(params.common.seed); - - struct llama_model_params mparams = llama_model_default_params(); - mparams.vocab_only = true; - - struct llama_context_params cparams = llama_context_default_params(); - - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, mparams); - struct llama_context * lctx = llama_new_context_with_model(lmodel, cparams); - - struct my_llama_model model; - model.hparams.n_vocab = llama_n_vocab(lmodel); - model.hparams.n_ctx = params.common.n_ctx; - model.hparams.n_embd = params.n_embd; - model.hparams.n_head = params.n_head; - model.hparams.n_layer = params.n_layer; - model.hparams.n_ff = params.n_ff; - // llama.cpp requires n_rot to be exactly n_embd / n_head - model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; - model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; - model.hparams.rope_freq_base = params.rope_freq_base; - model.hparams.rope_freq_scale = params.rope_freq_scale; - - struct train_state * train = init_train_state(); - struct ggml_opt_context * opt = train->opt; - - // set opt params from command line - opt->params = 
ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; - opt->params.n_threads = params.common.n_threads; - opt->params.past = params.common.opt_past; - opt->params.delta = params.common.opt_delta; - opt->params.max_no_improvement = params.common.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; - opt->params.adam.n_iter = params.common.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.common.adam_alpha; - opt->params.adam.decay = params.common.adam_decay; - opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; - opt->params.adam.beta1 = params.common.adam_beta1; - opt->params.adam.beta2 = params.common.adam_beta2; - opt->params.adam.gclip = params.common.adam_gclip; - opt->params.adam.eps_f = params.common.adam_eps_f; - - printf("%s: init model\n", __func__); - bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train); - if (existed) { - // overwrite last n_ctx with user provided n_ctx - if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; - } - - const bool opt_past_changed = opt->params.past != params.common.opt_past; - - if (opt_past_changed) { - die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); - // need to discard previous optimizer past function value statistics and opt_init with new shapes - // TODO - } - } else { - init_model(&model); - randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); - if (!params.only_write_model) { - ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model)); - } - } - opt->iter = train->train_its; - - print_params(&model.hparams); - printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); - printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); - printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); - printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f)); - - if (params.only_write_model) { - save_train_files_data save_data; - save_data.fn_checkpoint_out = ""; - save_data.fn_model_out = params.fn_model_out; - save_data.fn_vocab_model = params.fn_vocab_model; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - - save_train_files(&save_data, train); - - free_train_state(train); - ggml_free(model.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; - } - - printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - printf("%s: opt iter %d\n", __func__, opt->iter); - - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - - // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct 
ggml_context * ctx_input = ggml_init(ctx_input_params); - - // the input tensors - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - // measure required memory for input tensors - // allocate input tensors - ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); - size_t max_input_size = ggml_backend_buffer_get_size(input_data); - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - - // context for compute tensors without their data - const size_t estimated_compute_size_wo_data = ( - 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + - (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) - ); - struct ggml_init_params ctx_compute_params = { - estimated_compute_size_wo_data, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_compute = NULL; - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; - - struct ggml_cgraph * gf = NULL; - struct ggml_cgraph * gb = NULL; - struct ggml_cgraph * gb_tmp = NULL; - - // measure required memory for compute tensors - size_t best_compute_size = SIZE_MAX; - enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; - // find best evaluation order - for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_train_graphs( - &model, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - true - ); - size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer - if (max_compute_size < best_compute_size) { - best_compute_size = max_compute_size; - best_order = gf->order; - } - ggml_free(ctx_compute); - } - size_t max_compute_size = best_compute_size; - printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); - printf("%s: evaluation order = %s\n", __func__, - (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : - (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : - "invalid"); - - // allocate compute tensors - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = best_order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? 
ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_train_graphs( - &model, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - false - ); - - std::vector train_tokens; - std::vector train_samples_begin; - std::vector train_samples_size; - printf("%s: tokenize training data\n", __func__); - tokenize_file(lctx, - params.common.fn_train_data, - params.common.sample_start, - params.common.include_sample_start, - params.common.overlapping_samples, - n_tokens, - train_tokens, - train_samples_begin, - train_samples_size); - GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); - - printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); - - size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); - const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); - if (changed_train_data) { - printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); - } - if (params.common.force_reshuffle) { - printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); - } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); - train->shuffle_sample_count = train_samples_size.size(); - train->shuffle_next_sample = 0; - train->shuffle_samples_hash = shuffle_samples_hash; - } - std::vector train_shuffled_samples_offs; - std::vector train_shuffled_samples_begin; - std::vector train_shuffled_samples_size; - train_shuffled_samples_offs.resize(train_samples_begin.size()); - train_shuffled_samples_begin.resize(train_samples_begin.size()); - train_shuffled_samples_size.resize(train_samples_size.size()); - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - train_shuffled_samples_offs.data(), - train_shuffled_samples_begin.data(), - train_shuffled_samples_size.data(), - train_samples_begin.data(), - train_samples_size.data(), - train_samples_size.size()); - printf("%s: begin training\n", __func__); - - save_train_files_data save_data; - save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; - save_data.fn_model_out = params.fn_model_out; - save_data.fn_vocab_model = params.fn_vocab_model; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - - struct train_opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms.common; - opt_cb_data.train = train; - opt_cb_data.save_cb = &save_train_files; - opt_cb_data.save_data = &save_data; - opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_begin = train_samples_begin.data(); - opt_cb_data.samples_size = train_samples_size.data(); - opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); - opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); - opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); - opt_cb_data.samples_count = train_samples_size.size(); - opt_cb_data.tokens_input = tokens_input; - 
opt_cb_data.target_probs = target_probs; - opt_cb_data.first_iter = opt->iter; - opt_cb_data.first_epoch = train->train_epochs; - opt_cb_data.iter_at_last_epoch = -1; - opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.millis_per_iter = 0.0; - - // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; - printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); - - // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx_work = ggml_init(ctx_work_params); - - int64_t t0 = ggml_time_ms(); - - ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - - ggml_free(ctx_work); - ggml_free(ctx_compute); - ggml_free(ctx_input); - - int64_t t1 = ggml_time_ms(); - printf("%s: total training time: ", __func__); - print_duration((double) (t1 - t0)); - printf("\n"); - - int new_iters = opt->iter - opt_cb_data.last_save_iter; - if (new_iters > 0) { - train->train_its += new_iters; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - - save_train_files(&save_data, train); - opt_cb_data.last_save_iter = opt->iter; - } - - ggml_free(opt->ctx); - free_train_state(train); - ggml_free(model.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; -} From eddcb5238b2e09a37798b87cde1244017a194bcc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jul 2024 12:37:42 +0300 Subject: [PATCH 011/154] ggml : add and use ggml_cpu_has_llamafile() (#8664) --- ggml/include/ggml.h | 1 + ggml/src/ggml.c | 8 ++++++++ src/llama.cpp | 6 +----- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2fdb9fa40..548661b9b 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2400,6 +2400,7 @@ extern "C" { GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_cann (void); + GGML_API int ggml_cpu_has_llamafile (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index dbb3a3ebe..f65837e85 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -22005,6 +22005,14 @@ int ggml_cpu_has_cann(void) { #endif } +int ggml_cpu_has_llamafile(void) { +#if defined(GGML_USE_LLAMAFILE) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } diff --git a/src/llama.cpp b/src/llama.cpp index 9e502018d..80235ae19 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19146,11 +19146,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; -#ifdef GGML_USE_LLAMAFILE - s += "LLAMAFILE = 1 | "; -#else - s += "LLAMAFILE = 0 | "; -#endif + s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; return s.c_str(); } From ed67bcb24f2d6ac0072cae72620b2bd971741b98 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Thu, 25 Jul 2024 11:45:18 +0000 Subject: [PATCH 012/154] [SYCL] fix multi-gpu issue on sycl (#8554) --------- Signed-off-by: Chen Xi Co-authored-by: Meng, Hengyu --- docs/backend/SYCL.md 
| 36 ++++------ ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/dpct/helper.hpp | 101 +++++++++++++++++++++++++---- src/llama.cpp | 4 +- 4 files changed, 102 insertions(+), 41 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 885983e92..d36ac0a15 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow ```sh ./build/bin/llama-ls-sycl-device ``` -A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPUs* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: + | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | |------------------------|-------------------------------------------------------------| -| compute capability 1.3 | Level-zero driver/runtime, recommended | -| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | 4. Launch inference There are two device selection modes: - Single device: Use one device target specified by the user. -- Multiple devices: Automatically select the devices with the same largest Max compute-units. +- Multiple devices: Automatically choose the devices with the same backend. + +In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| @@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow build\bin\ls-sycl-device.exe ``` -The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. 
For example, in a system with 2 *intel GPUs* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | |------------------------|-----------------------------------------------------------| -| compute capability 1.3 | Level-zero running time, recommended | -| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | - 4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. -- Multiple devices: Automatically choose the devices with the same biggest Max compute units. +- Single device: Use one device assigned by the user. The default device id is 0. +- Multiple devices: Automatically choose the devices with the same backend. + +In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 68d41411b..397bd98dd 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -267,7 +267,7 @@ struct ggml_backend_sycl_context { queue_ptr stream(int device, int stream) { if (qptrs[device][stream] == nullptr) { - qptrs[device][stream] = &(dpct::get_current_device().default_queue()); + qptrs[device][stream] = &(dpct::get_device(device).default_queue()); } return qptrs[device][stream]; } diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 31df1cb9e..4aaa76bfb 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -588,7 +588,7 @@ namespace dpct out = prop; } - /// dpct device extension + /// dpct device extension class device_ext : public sycl::device { typedef std::mutex mutex_type; @@ -697,7 +697,7 @@ namespace dpct std::unique_lock<mutex_type> lock(m_mutex); lock.unlock(); for (auto &q : _queues) { - q.wait_and_throw(); + q.wait_and_throw(); } // Guard the destruct of current_queues to make sure the ref count is // safe. 
@@ -734,7 +734,12 @@ namespace dpct void destroy_queue(sycl::queue queue) { std::lock_guard<mutex_type> lock(m_mutex); - _queues.clear(); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const sycl::queue &q) -> bool + { + return q == queue; + }), + _queues.end()); } void set_saved_queue(sycl::queue q) { std::lock_guard<mutex_type> lock(m_mutex); @@ -764,13 +769,13 @@ namespace dpct if (enable_exception_handler) { eh = exception_handler; } - auto q = sycl::queue(*this, eh, - sycl::property_list( #ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), #endif - properties...)); - _queues.push_back(q); + _queues.push_back(sycl::queue( + *this, eh, + sycl::property_list( #ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), #endif + properties...))); return _queues.back(); } @@ -783,8 +788,8 @@ namespace dpct if (enable_exception_handler) { eh = exception_handler; } - _queues.push_back( - sycl::queue(device, eh, + _queues.push_back(sycl::queue( + device, eh, sycl::property_list( #ifdef DPCT_PROFILING_ENABLED sycl::property::queue::enable_profiling(), @@ -855,15 +860,75 @@ namespace dpct unsigned int get_device_id(const sycl::device &dev) { unsigned int id = 0; - for (auto dev_item : _devs) + for (auto &dev_item : _devs) { if (*dev_item == dev) { - break; + return id; } id++; } - return id; + return -1; + } + + inline std::string get_preferred_gpu_platform_name() { + std::string result; + + std::string filter = "level-zero"; + char* env = getenv("ONEAPI_DEVICE_SELECTOR"); + if (env) { + if (std::strstr(env, "level_zero")) { + filter = "level-zero"; + } + else if (std::strstr(env, "opencl")) { + filter = "opencl"; + } + else if (std::strstr(env, "cuda")) { + filter = "cuda"; + } + else if (std::strstr(env, "hip")) { + filter = "hip"; + } + else { + throw std::runtime_error("invalid device filter: " + std::string(env)); + } + } + + auto platform_list = sycl::platform::get_platforms(); + + for (const auto& platform : platform_list) { + auto devices = platform.get_devices(); + auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { + return d.is_gpu(); + }); + + if (gpu_dev == devices.end()) { + // cout << "platform [" << platform_name + // << "] does not contain GPU devices, skipping\n"; + continue; + } + + auto platform_name = platform.get_info<sycl::info::platform::name>(); + std::string platform_name_low_case; + platform_name_low_case.resize(platform_name.size()); + + std::transform( + platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower); + + if (platform_name_low_case.find(filter) == std::string::npos) { + // cout << "platform [" << platform_name + // << "] does not match with requested " + // << filter << ", skipping\n"; + continue; + } + + result = platform_name; + } + + if (result.empty()) + throw std::runtime_error("can not find preferred GPU platform"); + + return result; } template <class DeviceSelector> @@ -930,10 +995,15 @@ namespace dpct // Keep track of the number of devices per backend std::map<std::string, int> DeviceNums; std::map<std::string, std::vector<sycl::device>> backend_devices; + auto preferred_platform_name = get_preferred_gpu_platform_name(); while (!Platforms.empty()) { auto Platform = Platforms.back(); Platforms.pop_back(); + auto platform_name = Platform.get_info<sycl::info::platform::name>(); + if (platform_name.compare(preferred_platform_name) != 0) { + continue; + } auto devices = Platform.get_devices(); std::string backend_type = get_device_backend_and_type(devices[0]); for (const auto &device : devices) { @@ -1989,6 +2059,11 @@ namespace dpct return dev_mgr::instance().current_device(); } + static inline device_ext 
&get_device(unsigned int id) + { + return dev_mgr::instance().get_device(id); + } + static inline sycl::queue &get_in_order_queue() { return dev_mgr::instance().current_device().in_order_queue(); diff --git a/src/llama.cpp b/src/llama.cpp index 80235ae19..972f870b0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16643,9 +16643,7 @@ struct llama_context * llama_new_context_with_model( for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i); llama_free(ctx); return nullptr; } From 88954f7fbd31aeb8c75140edee03e7a8ad5e2d9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jul 2024 18:57:44 +0300 Subject: [PATCH 013/154] tests : fix printfs (#8068) --- tests/test-chat-template.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 46a7d3aea..a8222caee 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -146,7 +146,7 @@ int main(void) { auto fmt_sys = [&](std::string tmpl) { auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false); printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n", output.c_str()); + printf("-------------------------\n"); return output; }; assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"); @@ -165,7 +165,7 @@ int main(void) { auto fmt_single = [&](std::string tmpl) { auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n", output.c_str()); + printf("-------------------------\n"); return output; }; assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); From bf5a81df375f1c71e41462e1f48d57db359c9e80 Mon Sep 17 00:00:00 2001 From: "Andreas (Andi) Kunar" Date: Thu, 25 Jul 2024 18:01:00 +0200 Subject: [PATCH 014/154] ggml : fix build on Windows with Snapdragon X (#8531) * Improvements for Windows with Snapdragon X * Revert "Improvements for Windows with Snapdragon X" This reverts commit bf21397ae5ea7c73d3494db3b91505599909227d. * Improvements for Windows with Snapdragon X * WOA build clarifications * WIndows on ARM build clarifications * cmake build for Windows clarifications * Update docs/build.md Co-authored-by: Georgi Gerganov --------- Co-authored-by: AndreasKunar Co-authored-by: Georgi Gerganov --- docs/build.md | 13 ++++++++++++- ggml/src/ggml-aarch64.c | 12 ++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/build.md b/docs/build.md index 916fcf22d..d9d12c467 100644 --- a/docs/build.md +++ b/docs/build.md @@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options. make ``` - - On Windows: + - On Windows (x86/x64 only, arm64 requires cmake): 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. @@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options. 
cmake -B build -G "Xcode" cmake --build build --config Debug ``` + - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: + - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Tab Workload: Desktop-development with C++ + - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) + - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test + - For Windows on ARM (arm64, WoA) build with: + ```bash + cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF + cmake --build build-arm64-windows-llvm-release + ``` + Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. - Using `gmake` (FreeBSD): diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 26535b1c4..af53dea17 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) && defined(__aarch64__) +#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) if (svcntw() == 8) { const void * b_ptr = vx; const void * a_ptr = vy; @@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) && defined(__aarch64__) +#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) if (svcntw() == 8) { const void * b_ptr = vx; const void * a_ptr = vy; From 4226a8d10e3904db3a1297919fe6c7f06beba6c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jul 2024 19:57:31 +0300 Subject: [PATCH 015/154] llama : fix build + fix fabs compile warnings (#8683) ggml-ci --- src/llama-grammar.h | 2 -- src/llama.cpp | 10 ++-------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 8e578e09f..695ea0632 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -13,8 +13,6 @@ struct llama_grammar { llama_partial_utf8 partial_utf8; }; -struct llama_grammar * llama_get_grammar(struct llama_context * ctx); - // // internal API // diff --git a/src/llama.cpp b/src/llama.cpp index 972f870b0..bc0183741 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2657,7 +2657,6 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , sampling(llama_n_vocab(&model)) - , grammar() , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} @@ -2675,7 +2674,6 @@ struct llama_context { struct llama_cparams cparams; struct llama_sampling sampling; - struct llama_grammar grammar; struct llama_kv_cache kv_self; struct llama_control_vector cvec; @@ -14048,7 +14046,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { f = -INFINITY; } else { if (hparams.use_alibi) { - f = -fabs(lctx.kv_self.cells[i].pos - pos); + f = -std::abs(lctx.kv_self.cells[i].pos - pos); } else { f = 0.0f; } @@ -14102,7 +14100,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int s = 0; s < batch.n_seq_id[i]; ++s) { if (batch.seq_id[i][s] == seq_id) { if (hparams.use_alibi) { - f = -fabs(batch.pos[i] - batch.pos[j]); + f = -std::abs(batch.pos[i] - batch.pos[j]); } else { f = 0.0f; } @@ -16833,10 +16831,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) { return &ctx->model.vocab; } -struct llama_grammar * llama_get_grammar(struct llama_context * ctx) { - return &ctx->grammar; -} - uint32_t llama_n_ctx(const struct llama_context * ctx) { return ctx->cparams.n_ctx; } From 49ce0ab6d45402e8bb622bf86f86529f2b0ba552 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Thu, 25 Jul 2024 22:23:05 +0100 Subject: [PATCH 016/154] ggml: handle ggml_init failure to fix NULL pointer deref (#8692) `ggml_init` can fail if no unused context is found. In that case, a NULL-pointer deref will happen later in the code during a call to `ggml_set_on_alloc`. This fixes it by bailing out if no context is found. 
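For illustration, a minimal sketch of the guard pattern this commit introduces (not part of the patch; the wrapper name `init_ctx_checked` and its parameter are hypothetical):

```c
#include <stdio.h>
#include "ggml.h"

// ggml_init() returns NULL when it cannot obtain an unused context, so the
// result must be checked before any later ggml_* call dereferences it.
static struct ggml_context * init_ctx_checked(size_t mem_size) {
    struct ggml_init_params pdata = {
        /*.mem_size   =*/ mem_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(pdata);
    if (ctx == NULL) {
        // bail out cleanly instead of crashing on a NULL context later
        fprintf(stderr, "%s: failed to initialize context\n", __func__);
        return NULL;
    }
    return ctx;
}
```

Failing fast here keeps the error local, instead of surfacing as a NULL-pointer crash in a later call.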
--- ggml/src/ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f65837e85..29afcc7f8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21096,6 +21096,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p }; *params.ctx = ggml_init(pdata); + if (*params.ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } struct ggml_context * ctx_data = *params.ctx; From 41cd47caab88c442edc50e90c8d8d0ac3e82768d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 25 Jul 2024 23:49:39 +0200 Subject: [PATCH 017/154] examples : export-lora : fix issue with quantized base models (#8687) --- examples/export-lora/export-lora.cpp | 64 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 124ee167d..150f7e8d5 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -211,8 +211,9 @@ struct lora_merge_ctx { } } - // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile - std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors; + // mapping base tensor to out tensor (same shape as base, but possibly a different type) + // if out_tensor == nullptr, we only copy it + std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors; for (auto & it : base_model.tensors) { bool t_a = true; bool t_b = true; for (auto & adapter : adapters) { t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); } auto base_tensor = it.second; - struct ggml_tensor * out_tensor; if (!t_a && !t_b) { // only copy - out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - ggml_set_name(out_tensor, base_tensor->name); - base_tensors.push_back(std::make_pair(out_tensor, false)); + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + gguf_add_tensor(ctx_out, cpy_tensor); } else if (t_a && t_b) { // need merging - out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - out_tensor->type = get_out_tensor_type(base_tensor); + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); ggml_set_name(out_tensor, base_tensor->name); - base_tensors.push_back(std::make_pair(out_tensor, true)); + base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + gguf_add_tensor(ctx_out, out_tensor); } else { throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); } - gguf_add_tensor(ctx_out, out_tensor); } // placeholder for the meta data @@ -247,9 +248,9 @@ struct lora_merge_ctx { // process base model tensors size_t n_merged = 0; - for (auto & it : base_tensors) { - if (it.second) { - merge_tensor(it.first); + for (auto & it : base_to_out_tensors) { + if (it.second != nullptr) { + merge_tensor(it.first, it.second); n_merged++; } else { copy_tensor(it.first); } @@ -265,7 +266,7 @@ } printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size()); + printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -276,7 +277,7 @@ struct lora_merge_ctx { zeros(fout, GGML_PAD(len, 
GGUF_DEFAULT_ALIGNMENT) - len); } - void merge_tensor(struct ggml_tensor * base) { + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { std::string name_base(base->name); std::string name_lora_a = name_base + ".lora_a"; std::string name_lora_b = name_base + ".lora_b"; @@ -287,14 +288,14 @@ struct lora_merge_ctx { std::vector inp_a(adapters.size()); std::vector inp_b(adapters.size()); struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2), + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; struct ggml_context * ctx = ggml_init(params); // alloc tensors - struct ggml_tensor * inp = ggml_dup_tensor(ctx, base); + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); for (size_t i = 0; i < adapters.size(); ++i) { auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_b = adapters[i]->get_tensor(name_lora_b); @@ -303,9 +304,21 @@ struct lora_merge_ctx { } ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - // load data to backend buffer + // load base tensor to backend buffer base_model.read_tensor_data(name_base, read_buf); - ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp)); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer for (size_t i = 0; i < adapters.size(); ++i) { adapters[i]->read_tensor_data(name_lora_a, read_buf); ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); @@ -325,20 +338,21 @@ struct lora_merge_ctx { }; struct ggml_context * ctx0 = ggml_init(params0); gf = ggml_new_graph(ctx0); - struct ggml_tensor * cur = inp; + struct ggml_tensor * cur = inp_base; for (size_t i = 0; i < adapters.size(); ++i) { - struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i])); - struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]); + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))); + struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); // scale const float alpha = adapters[i]->alpha; const float rank = (float) inp_b[i]->ne[0]; const float scale = alpha ? 
adapters[i]->scale * alpha / rank : adapters[i]->scale; delta = ggml_scale(ctx0, delta, scale); - cur = ggml_add(ctx0, cur, delta); - printf("%s : + merging from adapter[%ld]\n", __func__, i); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); } - cur = ggml_cast(ctx0, cur, get_out_tensor_type(base)); + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); } From 01aec4a6310ab0160483196db0e726d78d4c94b6 Mon Sep 17 00:00:00 2001 From: Yaiko Date: Thu, 25 Jul 2024 18:10:16 -0400 Subject: [PATCH 018/154] server : add Speech Recognition & Synthesis to UI (#8679) * server : add Speech Recognition & Synthesis to UI * server : add Speech Recognition & Synthesis to UI (fixes) --- examples/server/public/index.html | 180 +++++++++++++++++++++++++++--- 1 file changed, 164 insertions(+), 16 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 48628a960..07fec6a38 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -1,5 +1,4 @@ - @@ -132,12 +131,20 @@ align-items: stretch; } - .right { + .message-controls { display: flex; - flex-direction: row; - gap: 0.5em; justify-content: flex-end; } + .message-controls > div:nth-child(2) { + display: flex; + flex-direction: column; + gap: 0.5em; + } + .message-controls > div:nth-child(2) > div { + display: flex; + margin-left: auto; + gap: 0.5em; + } fieldset { border: none; @@ -276,6 +283,7 @@ import { llama } from './completion.js'; import { SchemaConverter } from './json-schema-to-grammar.mjs'; + let selected_image = false; var slot_id = -1; @@ -447,6 +455,9 @@ /* END: Support for storing prompt templates and parameters in browsers LocalStorage */ + const tts = window.speechSynthesis; + const ttsVoice = signal(null) + const llamaStats = signal(null) const controller = signal(null) @@ -596,8 +607,51 @@ }); } + const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null; function MessageInput() { - const message = useSignal("") + const message = useSignal(""); + + const talkActive = useSignal(false); + const sendOnTalk = useSignal(false); + const talkStop = (e) => { + if (e) e.preventDefault(); + + talkActive.value = false; + talkRecognition?.stop(); + } + const talk = (e) => { + e.preventDefault(); + + if (talkRecognition) + talkRecognition.start(); + else + alert("Speech recognition is not supported by this browser."); + } + if(talkRecognition) { + talkRecognition.onstart = () => { + talkActive.value = true; + } + talkRecognition.onresult = (e) => { + if (event.results.length > 0) { + message.value = event.results[0][0].transcript; + if (sendOnTalk.value) { + submit(e); + } + } + } + talkRecognition.onspeechend = () => { + talkStop(); + } + } + + const ttsVoices = useSignal(tts?.getVoices() || []); + const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default)); + if (tts) { + tts.onvoiceschanged = () => { + ttsVoices.value = tts.getVoices(); + } + } const submit = (e) => { stop(e); @@ -624,11 +678,45 @@ value="${message}" /> -
- - - - +
+
+
+
+ + + + +
+ +
+ { + e.preventDefault(); + alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`); + }}>[?] + + +
+
` @@ -659,26 +747,86 @@ } }, [messages]) + const ttsChatLineActiveIx = useSignal(undefined); + const ttsChatLine = (e, ix, msg) => { + if (e) e.preventDefault(); + + if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return; + + const ttsVoices = tts.getVoices(); + const voice = ttsVoices.find(v => v.name === ttsVoice.value); + if (!voice) return; + + if (ttsChatLineActiveIx.value !== undefined) { + tts.cancel(); + if (ttsChatLineActiveIx.value === ix) { + ttsChatLineActiveIx.value = undefined; + return; + } + } + + ttsChatLineActiveIx.value = ix; + let ttsUtter = new SpeechSynthesisUtterance(msg); + ttsUtter.voice = voice; + ttsUtter.onend = e => { + ttsChatLineActiveIx.value = undefined; + }; + tts.speak(ttsUtter); + } + const isCompletionMode = session.value.type === 'completion' + + // Try play the last bot message + const lastCharChatLinesIxs = useSignal([]); + const lastCharChatLinesIxsOld = useSignal([]); + useEffect(() => { + if ( + !isCompletionMode + && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length + && !generating.value + ) { + const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1]; + if (ix !== undefined) { + const msg = messages[ix]; + ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg); + } + + lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value); + } + }, [generating.value]); + const chatLine = ([user, data], index) => { let message - const isArrayMessage = Array.isArray(data) + const isArrayMessage = Array.isArray(data); + const text = isArrayMessage ? + data.map(msg => msg.content).join('') : + data; if (params.value.n_probs > 0 && isArrayMessage) { message = html`<${Probabilities} data=${data} />` } else { - const text = isArrayMessage ? - data.map(msg => msg.content).join('') : - data; message = isCompletionMode ? text : html`<${Markdownish} text=${template(text)} />` } + + const fromBot = user && user === '{{char}}'; + if (fromBot && !lastCharChatLinesIxs.value.includes(index)) + lastCharChatLinesIxs.value.push(index); + if (user) { - return html`

${template(user)}: ${message}

` + return html` +
+

${template(user)}: ${message}

+ ${ + fromBot && ttsVoice.value + && html`
` + } +
+ `; } else { return isCompletionMode ? html`${message}` : - html`

${message}

` + html`

${message}

` } }; From 01245f5b1629075543bc4478418c7d72a0b4b3c7 Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 26 Jul 2024 16:38:12 +0800 Subject: [PATCH 019/154] llama : fix order of parameters (#8706) usage of `aclrtGetMemInfo` is correct: https://www.hiascend.com/doc_center/source/zh/canncommercial/63RC2/inferapplicationdev/aclcppdevg/aclcppdevg_03_0103.html Co-authored-by: Judd --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index bc0183741..77f7d32f8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2905,7 +2905,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { #elif defined(GGML_USE_CANN) size_t total; size_t free; - ggml_backend_cann_get_device_memory(device, &total, &free); + ggml_backend_cann_get_device_memory(device, &free, &total); return free; #else return 1; From 2b1f616b208a4a21c4ee7a7eb85d822ff1d787af Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 27 Jul 2024 04:41:55 +0200 Subject: [PATCH 020/154] ggml : reduce hash table reset cost (#8698) * ggml : reduce hash table reset cost * fix unreachable code warnings after GGML_ASSERT(false) * GGML_ASSERT(false) -> GGML_ABORT("fatal error") * GGML_ABORT use format string --- Makefile | 6 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 4 +- examples/llama-bench/llama-bench.cpp | 6 +- examples/llava/clip.cpp | 2 +- examples/tokenize/tokenize.cpp | 2 +- ggml/include/ggml.h | 37 +- ggml/src/ggml-alloc.c | 42 +- ggml/src/ggml-backend.c | 214 +++--- ggml/src/ggml-blas.cpp | 3 +- ggml/src/ggml-cann.cpp | 6 +- ggml/src/ggml-cann/aclnn_ops.cpp | 26 +- ggml/src/ggml-cuda.cu | 6 +- ggml/src/ggml-cuda/argsort.cu | 2 +- ggml/src/ggml-cuda/binbcast.cu | 2 +- ggml/src/ggml-cuda/common.cuh | 2 +- ggml/src/ggml-cuda/cpy.cu | 4 +- ggml/src/ggml-cuda/dmmv.cu | 2 +- ggml/src/ggml-cuda/fattn-common.cuh | 6 +- ggml/src/ggml-cuda/fattn-tile-f16.cu | 2 +- ggml/src/ggml-cuda/fattn-tile-f32.cu | 2 +- ggml/src/ggml-cuda/fattn.cu | 10 +- ggml/src/ggml-cuda/getrows.cu | 3 +- ggml/src/ggml-cuda/mmq.cu | 2 +- ggml/src/ggml-cuda/mmq.cuh | 4 +- ggml/src/ggml-cuda/mmvq.cu | 6 +- ggml/src/ggml-cuda/quantize.cu | 2 +- ggml/src/ggml-cuda/rope.cu | 4 +- ggml/src/ggml-impl.h | 116 +++- ggml/src/ggml-kompute.cpp | 8 +- ggml/src/ggml-metal.m | 42 +- ggml/src/ggml-quants.c | 12 +- ggml/src/ggml-sycl.cpp | 16 +- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/dmmv.cpp | 2 +- ggml/src/ggml-sycl/dpct/helper.hpp | 2 +- ggml/src/ggml-sycl/mmq.cpp | 22 +- ggml/src/ggml-sycl/mmvq.cpp | 2 +- ggml/src/ggml-sycl/rope.cpp | 4 +- ggml/src/ggml-vulkan.cpp | 58 +- ggml/src/ggml.c | 838 +++++++++++------------ src/llama-grammar.cpp | 4 +- src/llama-vocab.cpp | 14 +- src/llama.cpp | 36 +- tests/test-backend-ops.cpp | 8 +- tests/test-sampling.cpp | 10 +- 46 files changed, 851 insertions(+), 754 deletions(-) diff --git a/Makefile b/Makefile index 8d2ccddc4..7e015af3e 100644 --- a/Makefile +++ b/Makefile @@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG endif else MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 - MK_CXXFLAGS += -O3 - MK_NVCCFLAGS += -O3 + MK_CFLAGS += -O3 -g + MK_CXXFLAGS += -O3 -g + MK_NVCCFLAGS += -O3 -g endif ifdef LLAMA_SANITIZE_THREAD diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c8a3016a4..37d30ab8c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } 
else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } printf("%12.4f", v); sum += v; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 574f5ed9c..6ce1863cf 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } if (m_params.verbosity > 1) { printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); @@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; if (m_params.verbosity > 1) { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a6497b6e0..521fa8880 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) { case JSON: return "json"; case MARKDOWN: return "md"; case SQL: return "sql"; - default: GGML_ASSERT(!"invalid output format"); + default: GGML_ABORT("invalid output format"); } } @@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ASSERT(!"invalid split mode"); + default: GGML_ABORT("invalid split mode"); } } @@ -1326,7 +1326,7 @@ static std::unique_ptr create_printer(output_formats format) { case SQL: return std::unique_ptr(new sql_printer()); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } int main(int argc, char ** argv) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d23e282fb..7cda5f10c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = peg_0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 2afb6024c..17f5e4961 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { printf(">"); return; } - GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way."); + GGML_ABORT("MultiByteToWideChar() failed in an unexpected way."); } LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr)); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 548661b9b..464d765da 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -254,18 +254,8 @@ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fflush(stdout); \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - ggml_print_backtrace(); \ - abort(); \ - } \ - } while (0) 
- #ifndef NDEBUG -#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached") +#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) #elif defined(__GNUC__) #define GGML_UNREACHABLE() __builtin_unreachable() #elif defined(_MSC_VER) @@ -274,6 +264,17 @@ #define GGML_UNREACHABLE() ((void) 0) #endif +#ifdef __cplusplus +#define GGML_NORETURN [[noreturn]] +#elif defined(_MSC_VER) +#define GGML_NORETURN __declspec(noreturn) +#else +#define GGML_NORETURN _Noreturn +#endif + +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. // @@ -322,6 +323,9 @@ extern "C" { #endif + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) + GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); + enum ggml_status { GGML_STATUS_ALLOC_FAILED = -2, GGML_STATUS_FAILED = -1, @@ -636,8 +640,11 @@ extern "C" { GGML_CGRAPH_EVAL_ORDER_COUNT }; + typedef uint32_t ggml_bitset_t; + struct ggml_hash_set { size_t size; + ggml_bitset_t * used; struct ggml_tensor ** keys; }; @@ -651,7 +658,7 @@ extern "C" { struct ggml_tensor ** grads; struct ggml_tensor ** leafs; - struct ggml_hash_set visited_hash_table; + struct ggml_hash_set visited_hash_set; enum ggml_cgraph_eval_order order; }; @@ -698,8 +705,6 @@ extern "C" { GGML_API int64_t ggml_cycles(void); GGML_API int64_t ggml_cycles_per_ms(void); - GGML_API void ggml_print_backtrace(void); - // accepts a UTF-8 path, even on Windows GGML_API FILE * ggml_fopen(const char * fname, const char * mode); @@ -2005,8 +2010,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index e176b883e..e485326ab 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); - GGML_ASSERT(!"not enough space in the buffer"); - return; + GGML_ABORT("not enough space in the buffer"); } void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; @@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, 
return; } } - GGML_ASSERT(!"out of allocated_tensors"); + GGML_ABORT("out of allocated_tensors"); } static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { @@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs return; } } - fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); - GGML_ASSERT(!"tensor not found"); + GGML_ABORT("tried to free tensor %s not found\n", tensor->name); } #endif @@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz // this should never happen fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); - GGML_UNREACHABLE(); + GGML_ABORT("not enough space in the buffer"); } } @@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { } } - free(galloc->hash_set.keys); + ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_values); free(galloc->bufts); free(galloc->buffers); @@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { typedef struct ggml_gallocr * ggml_gallocr_t; static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { - size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); + size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); return &galloc->hash_values[i]; } @@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) { static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { // clear hash tables - memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); - memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + ggml_hash_set_reset(&galloc->hash_set); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); // allocate leafs // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes @@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { - size_t hash_size = graph->visited_hash_table.size; + size_t min_hash_size = graph->n_nodes + graph->n_leafs; + // add 25% margin to avoid hash collisions + min_hash_size += min_hash_size / 4; // initialize hash table - if (galloc->hash_set.size < hash_size) { - free(galloc->hash_set.keys); - free(galloc->hash_values); - galloc->hash_set.size = hash_size; - galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *)); - galloc->hash_values = calloc(hash_size, sizeof(struct hash_node)); + if (galloc->hash_set.size < min_hash_size) { + ggml_hash_set_free(&galloc->hash_set); + galloc->hash_set = ggml_hash_set_new(min_hash_size); GGML_ASSERT(galloc->hash_set.keys != NULL); + + free(galloc->hash_values); + galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); GGML_ASSERT(galloc->hash_values != NULL); - } else { - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } // reset allocators @@ 
-817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; - size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); return talloc->size_max >= node_size; } diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index d39cfed88..954ab2072 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -1055,11 +1055,10 @@ struct ggml_backend_sched { ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_gallocr_t galloc; - // hash keys of the nodes in the graph - struct ggml_hash_set hash_set; - // hash values - int * tensor_backend_id; - struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; + // hash map of the nodes in the graph + struct ggml_hash_set hash_set; + int * hv_tensor_backend_ids; // [hash_set.size] + struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] @@ -1068,7 +1067,7 @@ struct ggml_backend_sched { int * prev_leaf_backend_ids; // [graph_size] // copy of the graph with modified inputs - struct ggml_cgraph * graph; + struct ggml_cgraph graph; // graph splits struct ggml_backend_sched_split * splits; @@ -1087,19 +1086,16 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; - bool debug; + char * context_buffer; + size_t context_buffer_size; - // align context_buffer to GGML_MEM_ALIGN -#ifdef _MSC_VER - __declspec(align(GGML_MEM_ALIGN)) -#else - __attribute__((aligned(GGML_MEM_ALIGN))) -#endif - char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; + bool debug; }; -#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor) -#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)] +#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) +#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] +#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] +#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { @@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st return cur_backend_id; } - // assign nodes that use weights to the backend of the weights // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; @@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->is_reset = false; struct ggml_init_params params = { - /* .mem_size = */ sizeof(sched->context_buffer), + /* .mem_size = */ sched->context_buffer_size, /* .mem_buffer = */ sched->context_buffer, 
/* .no_alloc = */ true }; @@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->ctx = ggml_init(params); if (sched->ctx == NULL) { - fprintf(stderr, "%s: failed to initialize context\n", __func__); - GGML_ASSERT(false); + GGML_ABORT("%s: failed to initialize context\n", __func__); } // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; int * leaf_backend_id = &tensor_backend_id(leaf); - if (*leaf_backend_id != -1) { - // do not overwrite user assignments - continue; + // do not overwrite user assignments + if (*leaf_backend_id == -1) { + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } - *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - // do not overwrite user assignments - continue; - } - *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); - // src - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + // do not overwrite user assignments + if (*node_backend_id == -1) { + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); + +#if 0 + // src + if (node->op == GGML_OP_NONE) { continue; } - int * src_backend_id = &tensor_backend_id(src); - if (*src_backend_id == -1) { - *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + } } +#endif } } @@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 4: split graph, find tensors that need to be copied + // pass 5: split graph, find tensors that need to be copied { int i_split = 0; struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops - for (int i = 0; i < graph->n_nodes; i++) { + int i = 0; + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { split->backend_id = tensor_backend_id(node); @@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } split->i_start = 0; split->n_inputs = 0; - memset(split->inputs, 0, sizeof(split->inputs)); //HACK int cur_backend_id = split->backend_id; - for (int i = 0; i < graph->n_nodes; i++) { + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { @@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now + assert(node_backend_id != -1); // all nodes should be assigned by now // check if we should start a new split based on the sources of the current node bool need_new_split = false; @@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && 
src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = tensor_backend_id(src); - if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + if (src_backend_id != cur_backend_id) { need_new_split = true; break; } @@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); - int src_backend_id = sched->tensor_backend_id[id]; + int src_backend_id = sched->hv_tensor_backend_ids[id]; bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { + if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - const int src_backend_id = tensor_backend_id(src); + size_t src_id = hash_id(src); + const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; assert(src_backend_id != -1); // all inputs should be assigned by now if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { - size_t id = hash_id(src); - if (sched->tensor_copies[id][src_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy; @@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][src_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && !supported) { + if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend - const size_t id = hash_id(src); - if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); @@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split->inputs[n_inputs] = src; } - node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; + node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } @@ 
-1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_backend_sched_print_assignments(sched, graph); } - // swap node_backend_ids and leaf_backend_ids and prevs + // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; sched->node_backend_ids = sched->prev_node_backend_ids; @@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - // create copies of the graph for each split - // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); + int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; + if (sched->graph.size < graph_size) { + sched->graph.size = graph_size; + sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + GGML_ASSERT(sched->graph.nodes != NULL); + GGML_ASSERT(sched->graph.leafs != NULL); + } + sched->graph.n_nodes = 0; + sched->graph.n_leafs = 0; + + struct ggml_cgraph * graph_copy = &sched->graph; + for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); @@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; const size_t input_id = hash_id(input); - struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; + sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg size_t id = hash_id(input); int backend_id = tensor_backend_id(input); for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; size_t id = hash_id(input); for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } - - sched->graph = 
graph_copy; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; - for (int i = 0; i < sched->graph->n_nodes; i++) { + for (int i = 0; i < sched->graph.n_nodes; i++) { if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; @@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } } if (!backend_ids_changed) { - for (int i = 0; i < sched->graph->n_leafs; i++) { + for (int i = 0; i < sched->graph.n_leafs; i++) { if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; @@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } // allocate graph - if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__); + fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif - ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { fprintf(stderr, "%s: failed to allocate graph\n", __func__); return false; } @@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done @@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + sched->n_backends = n_backends; + sched->n_copies = parallel ? 
GGML_SCHED_MAX_COPIES : 1; // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size); - sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); - sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); + // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) + sched->hash_set = ggml_hash_set_new(graph_size); + sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - sched->n_backends = n_backends; - - sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; + sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); + sched->context_buffer = malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); @@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + ggml_hash_set_free(&sched->hash_set); free(sched->splits); - free(sched->hash_set.keys); - free(sched->tensor_backend_id); - free(sched->tensor_copies); + free(sched->hv_tensor_backend_ids); + free(sched->hv_tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); free(sched->prev_node_backend_ids); free(sched->prev_leaf_backend_ids); + free(sched->context_buffer); + free(sched->graph.nodes); + free(sched->graph.leafs); free(sched); } void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run if (!sched->is_reset) { - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT - memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); - memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - + ggml_hash_set_reset(&sched->hash_set); + memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); sched->is_reset = true; } sched->is_alloc = false; } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); ggml_backend_sched_split_graph(sched, measure_graph); - // TODO: extract this to a separate function - if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { + if 
(!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } @@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); ggml_backend_sched_split_graph(sched, graph); + if (!ggml_backend_sched_alloc_splits(sched)) { return false; } @@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; SET_CAUSE(node, "usr"); + sched->is_reset = false; } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { @@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, GGML_ASSERT(src != NULL); GGML_ASSERT(src->data && "graph must be allocated"); - size_t id = ggml_hash_insert(hash_set, src); - if (id == GGML_HASHTABLE_ALREADY_EXISTS) { - return node_copies[ggml_hash_find(hash_set, src)]; + size_t id = ggml_hash_insert(&hash_set, src); + if (id == GGML_HASHSET_ALREADY_EXISTS) { + return node_copies[ggml_hash_find(&hash_set, src)]; } struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); @@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, return dst; } -static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { +static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { - struct ggml_hash_set hash_set = { - /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT - }; + struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); @@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s if (ctx_allocated == NULL || ctx_unallocated == NULL) { fprintf(stderr, "failed to allocate context for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { fprintf(stderr, "failed to allocate buffer for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // copy data and init views for (int i = 0; i < 
graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_copy_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(&hash_set, node_copies, node_init, node); } // build graph copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)]; + struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)]; graph_copy->nodes[i] = node_copy; } graph_copy->n_nodes = graph->n_nodes; - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index a37aa4072..713731735 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t break; default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } } diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 9bf7e332a..ad5feea05 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { file, line); GGML_CANN_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CANN error"); + GGML_ABORT("CANN error"); } /** @@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { // memory should always buffered. these memory may still needed by // tasks in stream. // TODO, fix me. - GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); } }; @@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a02efc828..f27666970 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_max_pool2d(ctx, dst); break; case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { @@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (src->type == GGML_TYPE_F32) { // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size // && nb0 == type_size) @@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { // TODO: dst not contiguous - GGML_ASSERT(false); + 
GGML_ABORT("fatal error"); } } if (dst->type == GGML_TYPE_F16) { @@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_mul_mat_q8_0(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e48269e46..54ccf6bb1 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CUDA error"); + GGML_ABORT("CUDA error"); } // this is faster on Windows @@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (quantize_src1 && !src1_is_contiguous) { @@ -2945,7 +2945,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); #endif - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 15757ca18..607ded855 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co } else if (order == GGML_SORT_ORDER_DESC) { k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 19b08b74f..34bc67acd 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast( } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1c2e00c1e..eac026f47 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -348,7 +348,7 @@ static __device__ void no_device_code( #ifdef __CUDA_ARCH__ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) #else -#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ static __device__ __forceinline__ float 
warp_reduce_sum(float x) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 3db57034b..aad34bfe5 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 174489e06..d7a2a2513 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec( convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index f24312dd0..950fd93df 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); fprintf(stderr, "By default only f16 KV cache is supported.\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (D == 128) { fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); fprintf(stderr, "Supported combinations:\n"); @@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); fprintf(stderr, "Only f16 is supported.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index c6c35134d..1b2fd500b 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 15e22f495..f3e68dbfa 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without 
tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d30b210..29f608b0f 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } else { @@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); // break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 55af195fd..4c3703238 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { break; default: // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); break; } } diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 84f6387e2..78d70cd7a 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q( mul_mat_q_case(ctx, args, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index f08a4758d..e8a957447 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { case GGML_TYPE_IQ4_NL: return MMQ_Q8_1_DS_LAYOUT_D4; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda break; default: fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index e22faf69b..7dbbc9939 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda( rows_per_cuda_block = 2; break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda( mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -413,7 +413,7 @@ void 
ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index aa7f1eff0..45408ce86 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda( <<>>(x, vy, kx0, kx1, kx0_padded); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 596fb7c13..99ec1dd98 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a2c8dbec0..7f7afdbfc 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif -#define GGML_HASHTABLE_FULL ((size_t)-1) -#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +// bitset + +static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); +#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) +#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) + +static size_t ggml_bitset_size(size_t n) { + return (n + BITSET_MASK) >> BITSET_SHR; +} + +static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) { + return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK))); +} + +static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK)); +} + +static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK)); +} + +// hash set + +#define GGML_HASHSET_FULL ((size_t)-1) +#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) struct ggml_hash_set ggml_hash_set_new(size_t size); +void ggml_hash_set_free(struct ggml_hash_set * hash_set); -bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns the minimum size for a hash set that can hold min_sz elements +size_t ggml_hash_size(size_t min_sz); -// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted -size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// remove all elements from the hash set +void ggml_hash_set_reset(struct ggml_hash_set * hash_set); -// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full -size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns true if key is in the hash set +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted +static size_t ggml_hash_find(const struct ggml_hash_set 
* hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); // return index, asserts if table is full -size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key); +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// hash function for ggml_tensor +static inline size_t ggml_hash(const struct ggml_tensor * p) { + // the last 4 bits are always zero due to alignment + return (size_t)(uintptr_t)p >> 4; +} + +static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) { + i = (i + 1) % hash_set->size; + if (i == h) { + // visited all hash table entries -> not found + return GGML_HASHSET_FULL; + } + } + return i; +} + +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t i = ggml_hash_find(hash_set, key); + return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i); +} + +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return GGML_HASHSET_ALREADY_EXISTS; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} + +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return i; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} #ifdef __cplusplus } diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index ed5f2e349..41ac63fa4 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) { } if ((a % b) != 0) { fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); - GGML_ASSERT(!"safe_divide result would've had remainder"); + GGML_ABORT("safe_divide result would've had remainder"); } return a / b; } @@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml if (!ggml_vk_supports_op(dst)) { fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } const int32_t ne00 = src0 ? 
src0->ne[0] : 0; @@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml default: { fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } break; @@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml continue; not_implemented: {} fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } // Evaluate sequence diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index a7619bcca..48b813131 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute( NSError * error = nil; if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ASSERT(!"capture failed"); + GGML_ABORT("capture failed"); } } @@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute( if (!ggml_metal_supports_op(ctx, dst)) { GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } if (should_capture) { @@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } bcast_row = true; @@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } } @@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } [encoder setComputePipelineState:pipeline]; @@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_SQR: @@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + default: GGML_ABORT("MUL MAT-MAT not implemented"); } [encoder 
setComputePipelineState:pipeline]; @@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); + default: GGML_ABORT("MUL_MAT_ID not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } else { switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } @@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (dst->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (order) { case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } else { @@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); 
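// Illustrative sketch, not a hunk from this patch: every backend change in
// this series applies the same mechanical rewrite in a switch default.
// GGML_ASSERT(false && "msg") only reports the stringified expression, while
// GGML_ABORT reports file:line plus a printf-style message (and, per the
// ggml.c changes later in this patch, a backtrace). The function and the
// dispatch targets below are placeholders; only ggml.h types are assumed:
static void demo_dispatch(enum ggml_type t) {
    switch (t) {
        case GGML_TYPE_F32: /* dispatch f32 kernel */ break;
        case GGML_TYPE_F16: /* dispatch f16 kernel */ break;
        default:
            // before: GGML_ASSERT(false && "not implemented");
            // after: message survives, with arguments interpolated
            GGML_ABORT("unsupported type: %s", ggml_type_name(t));
    }
}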
GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } @@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; case GGML_TYPE_F16: @@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute( switch (dstt) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 47418597c..1c6c85aac 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -12692,7 +12692,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+1] |= (block_signs[k] << 7*k); @@ -12871,7 +12871,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+k] = grid_index | (block_signs[k] << 9); } @@ -13314,7 +13314,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (grid_size == 256) { q3[8*ib+k] = grid_index; @@ -13527,7 +13527,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } qs[k] = grid_index & 255; qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); @@ -14503,7 +14503,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int i8 = 2*ib + k; y[ibl].qs[i8] = grid_index & 255; @@ -14623,7 +14623,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } if (nbytes % ggml_type_size(type) != 0) { - fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); 
+ fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type)); return false; } diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 36518ff93..7cb07d0dc 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, }); }); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, // GGML_SYCL_DEBUG("current device index %d\n", id); src_ptr = (char *) extra->data_device[id]; } else { - // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n"); - GGML_ASSERT(false); + // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n"); + GGML_ABORT("fatal error"); } char * dst_ptr = (char *) dst; @@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te default: // TODO: k-quants fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::arraytype), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } (void) dst; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 397bd98dd..86d8b40e8 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -100,7 +100,7 @@ static void crash() { const char* msg) { fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); fprintf(stderr, " in function %s at %s:%d\n", func, file, line); - GGML_ASSERT(!"SYCL error"); + GGML_ABORT("SYCL error"); } #define SYCL_CHECK(err) \ diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 70a94fc16..ae45630e1 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( break; default: printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 4aaa76bfb..ef4609e32 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -975,7 +975,7 @@ namespace dpct if (backend == "opencl:cpu") return 4; if (backend == "opencl:acc") return 5; printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } static bool compare_backend(std::string &backend1, std::string &backend2) { return convert_backend_index(backend1) < convert_backend_index(backend2); diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index 3107ba919..e952533d3 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_0_PASCAL; nwarps = NWARPS_Q4_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -1914,7 +1914,7 @@ static void 
ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_1_PASCAL; nwarps = NWARPS_Q4_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_0_PASCAL; nwarps = NWARPS_Q5_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_1_PASCAL; nwarps = NWARPS_Q5_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q8_0_PASCAL; nwarps = NWARPS_Q8_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q2_K_PASCAL; nwarps = NWARPS_Q2_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q3_K_PASCAL; nwarps = NWARPS_Q3_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_K_PASCAL; nwarps = NWARPS_Q4_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_K_PASCAL; nwarps = NWARPS_Q5_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q6_K_PASCAL; nwarps = NWARPS_Q6_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q( ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 3fbc4dd60..23232357e 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q( mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 6f507941a..c7545bcc1 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -251,7 +251,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal 
error"); } } diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 6bcd81a7b..74991f6d1 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() { // Make sure at least one device exists if (devices.empty()) { std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Default to using all dedicated GPUs @@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf; @@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont staging = ctx->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf = nullptr; @@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s staging_buffer = dst->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si staging_buffer = src->device->sync_staging; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_ } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { @@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (mmp == nullptr) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Not implemented @@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c std::cerr << " and " << ggml_type_name(src1->type); } std::cerr << " to " << ggml_type_name(dst->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } op_func(ctx, subctx, src0, src1, dst); @@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 } else if (type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_s; shname = "F16_ALIGNED_S"; } else { - GGML_ASSERT(false); + 
GGML_ABORT("fatal error"); } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { @@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_m; shname = "F16_ALIGNED_M"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { @@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_l; shname = "F16_ALIGNED_L"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { GGML_ASSERT(0); @@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { @@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same()) { src0_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (std::is_same()) { src1_type = GGML_TYPE_F32; } else if (std::is_same()) { src1_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); @@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { std::cerr << std::endl; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { @@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return; } @@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } else if (tensor->type == GGML_TYPE_I32) { val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - 
GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { @@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); @@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * } } else { std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { @@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; std::vector done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) { first_error[0] = i0; @@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; std::vector done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 29afcc7f8..c196fd5bf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -141,23 +141,25 @@ typedef pthread_t ggml_thread_t; #include -void ggml_print_backtrace(void) { - /* - #include - #include - +#if defined(__linux__) +#include +static void ggml_print_backtrace_symbols(void) { void * trace[100]; - int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); - backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); - */ +} +#else +static void ggml_print_backtrace_symbols(void) { + // platform not supported +} +#endif - // backtrack_symbols does not show line numbers, use gdb instead +static void ggml_print_backtrace(void) { char attach[32]; snprintf(attach, sizeof(attach), "attach %d", getpid()); int pid = fork(); if (pid == 0) { + // try gdb execlp("gdb", "gdb", "--batch", "-ex", "set style enabled on", "-ex", attach, @@ -165,16 +167,46 @@ void ggml_print_backtrace(void) { "-ex", "detach", "-ex", "quit", (char *) NULL); + // try lldb + execlp("lldb", "lldb", "--batch", + "-o", "bt", + "-o", "quit", + "-p", attach, + (char *) NULL); + exit(EXIT_FAILURE); } else { - waitpid(pid, NULL, 0); + int wstatus; + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) == EXIT_FAILURE) { + // gdb failed, fallback to backtrace_symbols + ggml_print_backtrace_symbols(); 
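// Standalone sketch of the fallback path above, runnable on its own; it
// assumes glibc's <execinfo.h>, which is why the patch guards this code
// with __linux__:
#include <execinfo.h>   // backtrace, backtrace_symbols_fd
#include <unistd.h>     // STDERR_FILENO

static void demo_backtrace_symbols(void) {
    void * trace[100];
    // capture up to 100 return addresses from the current call stack
    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
    // symbolize straight to stderr; unlike backtrace_symbols() this variant
    // does not call malloc, but it prints addresses and offsets rather than
    // the source lines the gdb/lldb path produces, hence it is the fallback
    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
}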
+ } + } } } #else -void ggml_print_backtrace(void) { +static void ggml_print_backtrace(void) { // platform not supported } #endif +void ggml_abort(const char * file, int line, const char * fmt, ...) { + fflush(stdout); + + fprintf(stderr, "%s:%d: ", file, line); + + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + + fprintf(stderr, "\n"); + + ggml_print_backtrace(); + abort(); +} + #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -246,7 +278,7 @@ inline static void * ggml_aligned_malloc(size_t size) { break; } GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return NULL; } return aligned_memory; @@ -267,7 +299,7 @@ inline static void * ggml_malloc(size_t size) { void * result = malloc(size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -281,7 +313,7 @@ inline static void * ggml_calloc(size_t num, size_t size) { void * result = calloc(num, size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -3372,7 +3404,7 @@ static inline int ggml_up(int n, int m) { } // assert that pointer is aligned to GGML_MEM_ALIGN -#define ggml_assert_aligned(ptr) \ +#define GGML_ASSERT_ALIGNED(ptr) \ GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// @@ -3473,7 +3505,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT(ctx->mem_buffer != NULL); - ggml_assert_aligned(ctx->mem_buffer); + GGML_ASSERT_ALIGNED(ctx->mem_buffer); GGML_PRINT_DEBUG("%s: context initialized\n", __func__); @@ -3605,7 +3637,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml .type = type, }; - ggml_assert_aligned(mem_buffer + obj_new->offs); + GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs); if (obj_cur != NULL) { obj_cur->next = obj_new; @@ -3706,7 +3738,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( #endif // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //ggml_assert_aligned(result->data); + //GGML_ASSERT_ALIGNED(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3879,8 +3911,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -3938,8 +3970,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -4008,11 +4040,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - return 0.0f; } void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { @@ -4055,8 +4085,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4076,10 +4106,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); 
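// The ggml_abort() definition above is the function half of this change;
// the macro half is not quoted in the patch. For the call sites throughout
// this series to type-check, it is presumably a plain variadic forwarder in
// ggml.h, along these lines:
#ifndef GGML_ABORT // reconstruction for the reader; the real one is in ggml.h
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
#endif
// which is what allows both spellings used in this patch:
//     GGML_ABORT("fatal error");
//     GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(t));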
} - - return 0.0f; } void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { @@ -4111,8 +4139,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4149,11 +4177,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - return 0.0f; } void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { @@ -4190,8 +4216,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4211,10 +4237,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - - return 0.0f; } void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { @@ -4246,8 +4270,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4270,8 +4294,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) { } struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { - strncpy(tensor->name, name, sizeof(tensor->name) - 1); - tensor->name[sizeof(tensor->name) - 1] = '\0'; + size_t i; + for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) { + tensor->name[i] = name[i]; + } + tensor->name[i] = '\0'; return tensor; } @@ -4842,7 +4869,7 @@ struct ggml_tensor * ggml_mean( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -4865,7 +4892,7 @@ struct ggml_tensor * ggml_argmax( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); is_node = true; } @@ -5188,7 +5215,7 @@ static struct ggml_tensor * ggml_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -5291,7 +5318,7 @@ static struct ggml_tensor * ggml_group_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -5705,7 +5732,7 @@ struct ggml_tensor * ggml_reshape( if (b->grad) { // gradient propagation is not supported - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); @@ -6488,7 +6515,7 @@ struct ggml_tensor * ggml_clamp( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6564,7 +6591,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6636,7 +6663,7 @@ struct ggml_tensor * ggml_im2col( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6722,7 
+6749,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6763,7 +6790,7 @@ struct ggml_tensor * ggml_pool_1d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6801,7 +6828,7 @@ struct ggml_tensor * ggml_pool_2d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6834,7 +6861,7 @@ static struct ggml_tensor * ggml_upscale_impl( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6884,7 +6911,7 @@ struct ggml_tensor * ggml_pad( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6933,7 +6960,7 @@ struct ggml_tensor * ggml_timestep_embedding( bool is_node = false; if (timesteps->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7059,7 +7086,7 @@ struct ggml_tensor * ggml_flash_attn_back( struct ggml_tensor * v, struct ggml_tensor * d, bool masked) { - GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes"); + GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes"); GGML_ASSERT(ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -7158,7 +7185,7 @@ struct ggml_tensor * ggml_ssm_conv( bool is_node = false; if (s->grad || x->grad || c->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7212,7 +7239,7 @@ struct ggml_tensor * ggml_ssm_scan( bool is_node = false; if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7244,7 +7271,7 @@ struct ggml_tensor * ggml_win_part( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7282,7 +7309,7 @@ struct ggml_tensor * ggml_win_unpart( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7312,7 +7339,7 @@ struct ggml_tensor * ggml_get_rel_pos( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -8002,7 +8029,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8044,7 +8071,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8161,7 +8188,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8288,7 +8315,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - 
GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8348,7 +8375,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8517,7 +8544,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8603,7 +8630,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8663,7 +8690,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8834,7 +8861,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -9012,8 +9039,8 @@ static void ggml_compute_forward_dup( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9165,7 +9192,7 @@ static void ggml_compute_forward_add_f16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9240,7 +9267,7 @@ static void ggml_compute_forward_add_bf16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9292,7 +9319,7 @@ static void ggml_compute_forward_add_f16_f16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9344,7 +9371,7 @@ static void ggml_compute_forward_add_bf16_bf16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9438,7 +9465,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_F16: @@ -9450,7 +9477,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -9462,7 +9489,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -9492,8 +9519,8 @@ static void ggml_compute_forward_add( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9827,7 +9854,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -9839,7 +9866,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -9870,8 +9897,8 @@ static void ggml_compute_forward_add1( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9995,8 +10022,8 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10076,8 +10103,8 @@ static void ggml_compute_forward_sub( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10170,8 +10197,8 @@ static void ggml_compute_forward_mul( } break; 
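// Note that the hunks above and below do more than swap the call: the now
// unreachable "} break;" after each abort is reduced to "}", and the dead
// "return 0.0f;" fallbacks in the getters (ggml_get_i32_1d, ggml_get_f32_nd,
// ...) are deleted. Both cleanups stay warning-free only if the compiler
// knows ggml_abort() cannot return, so its declaration presumably carries a
// noreturn attribute (not quoted in this patch), roughly:
GGML_NORETURN void ggml_abort(const char * file, int line, const char * fmt, ...);
// Without that, "control reaches end of non-void function" would reappear
// in every getter whose switch ends in GGML_ABORT.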
default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10261,8 +10288,8 @@ static void ggml_compute_forward_div( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10306,8 +10333,8 @@ static void ggml_compute_forward_sqr( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10351,8 +10378,8 @@ static void ggml_compute_forward_sqrt( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10396,8 +10423,8 @@ static void ggml_compute_forward_log( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10525,8 +10552,8 @@ static void ggml_compute_forward_sum( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10578,8 +10605,8 @@ static void ggml_compute_forward_sum_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10635,8 +10662,8 @@ static void ggml_compute_forward_mean( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10683,8 +10710,8 @@ static void ggml_compute_forward_argmax( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10801,8 +10828,8 @@ static void ggml_compute_forward_repeat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10879,8 +10906,8 @@ static void ggml_compute_forward_repeat_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10948,8 +10975,8 @@ static void ggml_compute_forward_concat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10992,8 +11019,8 @@ static void ggml_compute_forward_abs( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11036,8 +11063,8 @@ static void ggml_compute_forward_sgn( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11080,8 +11107,8 @@ static void ggml_compute_forward_neg( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11124,8 +11151,8 @@ static void ggml_compute_forward_step( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11168,8 +11195,8 @@ static void ggml_compute_forward_tanh( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11212,8 +11239,8 @@ static void ggml_compute_forward_elu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11256,8 +11283,8 @@ static void ggml_compute_forward_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11300,8 +11327,8 @@ static void ggml_compute_forward_sigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11359,8 +11386,8 @@ static void ggml_compute_forward_gelu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11418,8 +11445,8 @@ static void ggml_compute_forward_gelu_quick( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11477,8 +11504,8 @@ static void ggml_compute_forward_silu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } // ggml_compute_forward_leaky_relu @@ -11526,8 +11553,8 @@ static void 
ggml_compute_forward_leaky_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11589,8 +11616,8 @@ static void ggml_compute_forward_silu_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11631,8 +11658,8 @@ static void ggml_compute_forward_hardswish( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11673,8 +11700,8 @@ static void ggml_compute_forward_hardsigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11745,8 +11772,8 @@ static void ggml_compute_forward_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11813,8 +11840,8 @@ static void ggml_compute_forward_rms_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11986,8 +12013,8 @@ static void ggml_compute_forward_rms_norm_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12080,8 +12107,8 @@ static void ggml_compute_forward_group_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12839,17 +12866,17 @@ static void ggml_compute_forward_out_prod( } break; case GGML_TYPE_F16: { - GGML_ASSERT(false); // todo + GGML_ABORT("fatal error"); // todo // ggml_compute_forward_out_prod_f16_f32(params, dst); - } break; + } case GGML_TYPE_F32: { ggml_compute_forward_out_prod_f32(params, dst); } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12908,8 +12935,8 @@ static void ggml_compute_forward_scale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13024,8 +13051,8 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13302,8 +13329,8 @@ static void ggml_compute_forward_get_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13410,8 +13437,8 @@ static void ggml_compute_forward_get_rows_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13488,8 +13515,8 @@ static void ggml_compute_forward_diag( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13558,8 +13585,8 @@ static void ggml_compute_forward_diag_mask_inf( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13576,8 +13603,8 @@ static void ggml_compute_forward_diag_mask_zero( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13694,8 +13721,8 @@ static void ggml_compute_forward_soft_max( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13790,8 +13817,8 @@ static void ggml_compute_forward_soft_max_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13881,8 +13908,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_F64: case GGML_TYPE_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14211,8 +14238,8 @@ static void ggml_compute_forward_rope( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14235,8 +14262,8 @@ static void ggml_compute_forward_rope_back( } break; default: { - 
GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14435,8 +14462,8 @@ static void ggml_compute_forward_conv_transpose_1d( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14607,8 +14634,8 @@ static void ggml_compute_forward_im2col( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14740,20 +14767,20 @@ static void ggml_compute_forward_pool_1d_sk_p0( switch (op) { case GGML_OP_POOL_AVG: drow[i] = 0; break; case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { switch (op) { case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } ++j; } switch (op) { case GGML_OP_POOL_AVG: drow[i] /= k; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } @@ -14822,7 +14849,7 @@ static void ggml_compute_forward_pool_2d( switch (op) { case GGML_OP_POOL_AVG: *out = 0; break; case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } const int ix = offset0 + ox * s0; @@ -14837,14 +14864,14 @@ static void ggml_compute_forward_pool_2d( switch (op) { case GGML_OP_POOL_AVG: *out += srow[j]; break; case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } switch (op) { case GGML_OP_POOL_AVG: *out /= ka; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } @@ -14908,8 +14935,8 @@ static void ggml_compute_forward_upscale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14966,8 +14993,8 @@ static void ggml_compute_forward_pad( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15007,8 +15034,8 @@ static void ggml_compute_forward_arange( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15058,8 +15085,8 @@ static void ggml_compute_forward_timestep_embedding( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15117,8 +15144,8 @@ static void ggml_compute_forward_argsort( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15340,8 +15367,8 @@ static void ggml_compute_forward_flash_attn_ext( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15676,8 +15703,8 @@ static void ggml_compute_forward_flash_attn_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15798,8 +15825,8 @@ static void ggml_compute_forward_ssm_conv( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15919,8 +15946,8 @@ static void ggml_compute_forward_ssm_scan( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15982,8 +16009,8 @@ static void ggml_compute_forward_win_part( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal 
error"); + } } } @@ -16043,8 +16070,8 @@ static void ggml_compute_forward_win_unpart( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16111,8 +16138,8 @@ static void ggml_compute_forward_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16158,8 +16185,8 @@ static void ggml_compute_forward_get_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16239,8 +16266,8 @@ static void ggml_compute_forward_add_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16285,8 +16312,8 @@ static void ggml_compute_forward_map_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16334,8 +16361,8 @@ static void ggml_compute_forward_map_binary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16533,8 +16560,8 @@ static void ggml_compute_forward_cross_entropy_loss( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16620,8 +16647,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16956,14 +16983,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } //////////////////////////////////////////////////////////////////////////////// -static size_t ggml_hash_size(size_t min_sz) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { + size = ggml_hash_size(size); + struct ggml_hash_set result; + result.size = size; + result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); + result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t)); + return result; +} + +void ggml_hash_set_reset(struct ggml_hash_set * hash_set) { + memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size)); +} + +void ggml_hash_set_free(struct ggml_hash_set * hash_set) { + GGML_FREE(hash_set->used); + GGML_FREE(hash_set->keys); +} + +size_t ggml_hash_size(size_t min_sz) { // next primes after powers of two static const size_t primes[] = { 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, @@ -16974,7 +17019,7 @@ static size_t ggml_hash_size(size_t min_sz) { }; static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); - // find the smallest prime that is larger or equal to min_sz + // find the smallest prime that is larger or equal than min_sz size_t l = 0; size_t r = n_primes; while (l < r) { @@ -16989,67 +17034,6 @@ static size_t ggml_hash_size(size_t min_sz) { return sz; } -static size_t ggml_hash(const void * p) { - return (size_t)p; -} - -size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t h = ggml_hash(key) % hash_set.size; - - // linear probing - size_t i = h; - while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { - i = (i + 1) % hash_set.size; - if (i == h) { - // visited all hash table entries -> not found - return GGML_HASHTABLE_FULL; - } - } - return i; -} - -bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key; -} - -size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - 
- GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - if (hash_set.keys[i] == key) { - return GGML_HASHTABLE_ALREADY_EXISTS; - } - - // insert - GGML_ASSERT(hash_set.keys[i] == NULL); - hash_set.keys[i] = key; - return i; -} - -size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - - GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - hash_set.keys[i] = key; - return i; -} - -struct ggml_hash_set ggml_hash_set_new(size_t size) { - size = ggml_hash_size(size); - struct ggml_hash_set result; - result.size = size; - result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); - memset(result.keys, 0, sizeof(struct ggml_tensor *) * size); - return result; -} - -static void ggml_hash_set_free(struct ggml_hash_set hash_set) { - GGML_FREE(hash_set.keys); -} - struct hash_map { struct ggml_hash_set set; struct ggml_tensor ** vals; @@ -17058,13 +17042,12 @@ struct hash_map { static struct hash_map * ggml_new_hash_map(size_t size) { struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size); - memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); + result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *)); return result; } static void ggml_hash_map_free(struct hash_map * map) { - ggml_hash_set_free(map->set); + ggml_hash_set_free(&map->set); GGML_FREE(map->vals); GGML_FREE(map); } @@ -17085,7 +17068,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return node; } - if (!ggml_hash_contains(graph->visited_hash_table, node)) { + if (!ggml_hash_contains(&graph->visited_hash_set, node)) { return node; } @@ -17100,8 +17083,8 @@ static struct ggml_tensor * ggml_recompute_graph_node( return node; } - size_t i = ggml_hash_find(replacements->set, node); - GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full + size_t i = ggml_hash_find(&replacements->set, node); + GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full if (replacements->set.keys[i] == node) { return replacements->vals[i]; } @@ -17159,8 +17142,8 @@ void ggml_build_backward_gradient_checkpointing( // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = ggml_hash_find(replacements->set, checkpoints[i]); - GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full + size_t k = ggml_hash_find(&replacements->set, checkpoints[i]); + GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite replacements->set.keys[k] = checkpoints[i]; replacements->vals[k] = checkpoints[i]; @@ -17188,7 +17171,7 @@ void ggml_build_backward_gradient_checkpointing( // functions to change gradients considering the case that input a might be initial gradient with zero value -static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return b; } else { @@ -17196,7 +17179,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, 
size_t offset, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); @@ -17205,7 +17188,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_repeat(ctx, b, a); } else { @@ -17213,7 +17196,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g } } -static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_neg(ctx, b); } else { @@ -17221,7 +17204,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg } } -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) { +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; @@ -17390,8 +17373,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_MEAN: case GGML_OP_ARGMAX: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_REPEAT: { // necessary for llama @@ -17414,16 +17397,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CONCAT: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_SILU_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_RMS_NORM: { // necessary for llama @@ -17439,12 +17422,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_RMS_NORM_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_GROUP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged @@ -17505,12 +17488,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MUL_MAT_ID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_OUT_PROD: { - 
GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_SCALE: { // necessary for llama @@ -17686,12 +17669,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_ROWS_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG_MASK_INF: { // necessary for llama @@ -17729,8 +17712,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SOFT_MAX_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ROPE: { // necessary for llama @@ -17805,52 +17788,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CLAMP: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_IM2COL: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_UPSCALE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_PAD: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ARANGE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_TIMESTEP_EMBEDDING: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ARGSORT: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_LEAKY_RELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_FLASH_ATTN_EXT: { struct ggml_tensor * flash_grad = NULL; @@ -17906,13 +17889,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_UNARY: @@ -17950,12 +17933,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_TANH: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case 
GGML_UNARY_OP_ELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_RELU: { if (src0->grad) { @@ -17969,16 +17952,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_SIGMOID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU_QUICK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_SILU: { // necessary for llama @@ -17990,7 +17973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_GET_REL_POS: @@ -18004,8 +17987,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { @@ -18020,16 +18003,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_NONE: { // nop } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } for (int i = 0; i < GGML_MAX_SRC; ++i) { @@ -18049,7 +18032,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) { + if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) { return; } @@ -18131,7 +18114,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { if (gf->grads[i]) { - ggml_hash_insert(zero_table, gf->grads[i]); + ggml_hash_insert(&zero_table, gf->grads[i]); } } @@ -18141,7 +18124,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * // inplace operations to add gradients are not created by ggml_compute_backward // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, zero_table); + ggml_compute_backward(ctx, node, &zero_table); } } @@ -18154,16 +18137,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - ggml_hash_set_free(zero_table); + ggml_hash_set_free(&zero_table); +} + +static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { + void * ptr = *p; + ptr = (void *) GGML_PAD((uintptr_t) ptr, align); + *p = (void *) ((char *) ptr + size); + return ptr; } static size_t ggml_graph_nbytes(size_t size, bool grads) { - size_t nbytes = sizeof(struct ggml_cgraph); - nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes + size_t hash_size = ggml_hash_size(size * 2); + void * p = 0; + incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1); + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes + incr_ptr_aligned(&p, size * 
sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs + incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys if (grads) { - nbytes += size * sizeof(struct ggml_tensor *); // grads + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads } - nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set + incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); + + size_t nbytes = (size_t) p; return nbytes; } @@ -18180,19 +18176,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); - struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); - + // the size of the hash table is doubled since it needs to hold both nodes and leafs size_t hash_size = ggml_hash_size(size * 2); - struct ggml_tensor ** nodes_ptr = data_start; - struct ggml_tensor ** leafs_ptr = nodes_ptr + size; - struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size; - struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL; + + void * p = cgraph + 1; + + struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); // check that we allocated the correct amount of memory - assert(obj_size == (size_t) ( - (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); - - memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *)); + assert(obj_size == (size_t)((char *)p - (char *)cgraph)); *cgraph = (struct ggml_cgraph) { /*.size =*/ size, @@ -18201,10 +18197,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.nodes =*/ nodes_ptr, /*.grads =*/ grads_ptr, /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_keys_ptr }, + /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; + ggml_hash_set_reset(&cgraph->visited_hash_set); + return cgraph; } @@ -18220,7 +18218,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.nodes =*/ cgraph0->nodes + i0, /*.grads =*/ cgraph0->grads ? 
cgraph0->grads + i0 : NULL, /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL }, + /*.hash_table =*/ { 0, NULL, NULL }, /*.order =*/ cgraph0->order, }; @@ -18230,7 +18228,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { GGML_ASSERT(dst->size >= src->n_leafs); GGML_ASSERT(dst->size >= src->n_nodes); - GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size); dst->n_leafs = src->n_leafs; dst->n_nodes = src->n_nodes; @@ -18251,9 +18249,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } - for (size_t i = 0; i < src->visited_hash_table.size; ++i) { - if (src->visited_hash_table.keys[i]) { - ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + for (size_t i = 0; i < src->visited_hash_set.size; ++i) { + if (src->visited_hash_set.keys[i]) { + ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); } } } @@ -18279,7 +18277,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { void ggml_graph_clear(struct ggml_cgraph * cgraph) { cgraph->n_leafs = 0; cgraph->n_nodes = 0; - memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); + ggml_hash_set_reset(&cgraph->visited_hash_set); } // @@ -18471,7 +18469,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = n_threads; } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } break; case GGML_OP_SILU_BACK: @@ -18598,8 +18596,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: { fprintf(stderr, "%s: op not implemented: ", __func__); @@ -18608,8 +18606,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } else { fprintf(stderr, "%d\n", node->op); } - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } assert(n_tasks > 0); @@ -18719,7 +18717,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa cur += sizeof(float)*ne00*ne01*ne02; cur += sizeof(float)*ne10*ne11; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_CONV_TRANSPOSE_2D: @@ -18765,8 +18763,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: break; } @@ -20000,9 +19998,9 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - GGML_ASSERT(false && "line search failed"); + GGML_ABORT("line search failed"); - return GGML_LINESEARCH_FAIL; + //return GGML_LINESEARCH_FAIL; } static enum ggml_opt_result ggml_opt_lbfgs( @@ -20270,9 +20268,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - GGML_ASSERT(false && "lbfgs failed"); + GGML_ABORT("lbfgs failed"); - return GGML_OPT_RESULT_DID_NOT_CONVERGE; + //return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -20967,10 +20965,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: 
GGML_ABORT("invalid type"); } if (!ok) { @@ -21551,12 +21549,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); GGML_FREE((void *)data); } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { - GGML_ASSERT(false && "nested arrays not supported"); + GGML_ABORT("nested arrays not supported"); } else { gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); } } break; - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } } @@ -21565,7 +21563,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { if (gguf_find_tensor(ctx, tensor->name) != -1) { - GGML_ASSERT(false && "duplicated tensor name"); + GGML_ABORT("duplicated tensor name"); } const int idx = ctx->header.n_tensors; @@ -21598,7 +21596,7 @@ void gguf_add_tensor( void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].type = type; @@ -21607,7 +21605,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].data = data; @@ -21736,10 +21734,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: GGML_ABORT("invalid type"); } } @@ -21800,7 +21798,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { FILE * file = ggml_fopen(fname, "wb"); if (!file) { - GGML_ASSERT(false && "failed to open file for writing"); + GGML_ABORT("failed to open file for writing"); } struct gguf_buf buf = gguf_buf_init(16*1024); diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index bd9322e2f..b123d7331 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -221,7 +221,7 @@ static void llama_grammar_advance_stack( // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc return; } } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const std::string & piece = vocab->cache_token_to_piece.at(token); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index c482b3689..133094904 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { return strtol(buf.c_str(), NULL, 16); } case LLAMA_VOCAB_TYPE_BPE: { - GGML_ASSERT(false); - return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT? 
+ GGML_ABORT("fatal error"); + //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT? } case LLAMA_VOCAB_TYPE_WPM: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1396,7 +1396,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } } break; case LLAMA_VOCAB_TYPE_NONE: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return output; @@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(unicode_byte_to_utf8(ch)); } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token break; } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/src/llama.cpp b/src/llama.cpp index 77f7d32f8..bc830c0ef 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2259,8 +2259,7 @@ struct llama_hparams { return n_head_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_head_kv(uint32_t il = 0) const { @@ -2268,8 +2267,7 @@ struct llama_hparams { return n_head_kv_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_ff(uint32_t il = 0) const { @@ -2277,8 +2275,7 @@ struct llama_hparams { return n_ff_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_gqa(uint32_t il = 0) const { @@ -8072,7 +8069,7 @@ static struct ggml_tensor * llm_build_moe_ffn( cb(gate, "ffn_moe_gelu", il); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] @@ -8635,8 +8632,8 @@ struct llm_build_context { } break; default: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } cb(cur, "result_embd_pooled", -1); @@ -8891,7 +8888,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -11723,7 +11720,7 @@ struct llm_build_context { switch (model.type) { case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; cb(Qcur, "Qcur_scaled", il); @@ -13888,7 +13885,7 @@ static struct ggml_cgraph * llama_build_graph( result = llm.build_jais(); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // add on pooling layer @@ -14687,8 +14684,8 @@ static int llama_decode_internal( } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } } n_outputs_prev += lctx.n_outputs; @@ -15079,7 +15076,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA - GGML_ASSERT(false && "Deepseek2 does not support K-shift"); + GGML_ABORT("Deepseek2 does not support K-shift"); } { @@ -15218,7 +15215,7 @@ static void llama_tensor_dequantize_internal( } else if (ggml_is_quantized(tensor->type)) { qtype.to_float(tensor->data, 
            f32_output, nelements);
     } else {
-        GGML_ASSERT(false); // unreachable
+        GGML_ABORT("fatal error"); // unreachable
     }
     return;
 }
@@ -16904,8 +16901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
-            GGML_ASSERT(false && "unknown architecture");
-            break;
+            GGML_ABORT("unknown architecture");
     }
 
     return LLAMA_ROPE_TYPE_NONE;
@@ -18469,7 +18465,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
         return nullptr;
     }
@@ -18514,7 +18510,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
         return nullptr;
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2c03c60d4..2fa59fd0a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -94,7 +94,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -132,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                 tt.to_float(&buf[i], vq.data(), bs);
                 tv.insert(tv.end(), vq.begin(), vq.end());
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
         }
     }
@@ -1435,7 +1435,7 @@ struct test_argsort : public test_case {
                 ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
             }
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 }
@@ -2462,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return true;
     }
 
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
     return false;
 }
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6374958fe..de858bd3b 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -166,12 +166,12 @@ static void test_sampler_queue(
     for (auto s : samplers_sequence) {
         switch (s){
             case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
-            case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
+            case 'f': GGML_ABORT("tail_free test not implemented"); break;
+            case 'y': GGML_ABORT("typical test not implemented"); break;
             case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
             case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-            default : GGML_ASSERT(false && "Unknown sampler"); break;
+            case 't': GGML_ABORT("temperature test not implemented"); break;
+            default : GGML_ABORT("Unknown sampler"); break;
         }
 
         llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
         GGML_ASSERT(candidates_p.data[0].id == max_token_id);
         GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

From bfb4c74981f0a40d757b450b596a9fe4ca983d26 Mon Sep 17 00:00:00 2001
From: wangshuai09 <391746016@qq.com>
Date: Sat, 27 Jul 2024 16:36:44 +0800
Subject: [PATCH 021/154] cann: Fix Multi-NPU execution error (#8710)

* cann: fix multi-npu exec error

* cann: update comment for ggml_backend_cann_supports_buft
---
 ggml/src/ggml-cann.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index ad5feea05..461febcc0 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
             return false;
         }
 
+        // need to open peer access in both directions for memcpyasync between devices.
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
         ggml_cann_set_device(cann_ctx_src->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
+
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
                                    ACL_MEMCPY_DEVICE_TO_DEVICE,
-                                   cann_ctx_dst->stream()));
+                                   cann_ctx_src->stream()));
 
-        // record event on src stream
-        if (!cann_ctx_src->copy_event) {
-            ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
-        }
-
-        ACL_CHECK(
-            aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
-
-        // wait on dst stream for the copy to complete
-        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
-                                       cann_ctx_src->copy_event));
+        // TODO: workaround, since events didn't work here; synchronize the src stream instead.
+        aclrtSynchronizeStream(cann_ctx_src->stream());
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  *
 * This function determines whether the CANN backend supports the given backend
 * buffer type by comparing the device context of the backend and buffer type.
- * It returns true if the device associated with the buffer type matches the
- * device associated with the backend.
+ * It returns true if the device of the backend context is the same as the
+ * device of the buffer type context.
  *
  * @param backend Pointer to the CANN backend.
  * @param buft Pointer to the backend buffer type to check.
@@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  */
 GGML_CALL static bool ggml_backend_cann_supports_buft(
     ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
-
-    GGML_UNUSED(backend);
+    if (ggml_backend_buft_is_cann(buft)) {
+        ggml_backend_cann_context * cann_ctx =
+            (ggml_backend_cann_context *)backend->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx =
+            (ggml_backend_cann_buffer_type_context *)buft->context;
+        return buft_ctx->device == cann_ctx->device;
+    }
+    return false;
 }
 
 /**

From 9d03d085dd6cb275c078690bb64073b9b043e95f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Sat, 27 Jul 2024 12:45:02 +0200
Subject: [PATCH 022/154] common : add --no-warmup option for main/llama-cli
 (#8712)

This commit adds a --no-warmup option for llama-cli. The motivation for
this is that it can be convenient to skip the warmup llama_decode call
when debugging.
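
For example, a typical invocation would be (a usage sketch; the model
path and prompt below are placeholders, not part of this change):

    llama-cli -m models/7B/ggml-model-f16.gguf -p "hello" --no-warmup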
Signed-off-by: Daniel Bevenius
---
 common/common.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index ec44a0552..60c7eac75 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
 
     options.push_back({ "server infill",
                         " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

From 92090eca212650727e38b335c1d4accfbcc9b79c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 27 Jul 2024 14:59:29 +0300
Subject: [PATCH 023/154] llama : add function for model-based max number of
 graph nodes (#8622)

* llama : model-based max number of graph nodes

ggml-ci

* llama : disable 405B max_nodes path due to lack of complaints

ggml-ci
---
 src/llama.cpp | 96 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 43 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index bc830c0ef..c9cdbb343 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -101,7 +101,6 @@
 #endif
 
 // bump if necessary
-#define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
 
@@ -3567,6 +3566,15 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
 
+// TODO: update when needed or think of some clever automatic way to do this
+static size_t llama_model_max_nodes(const llama_model & /*model*/) {
+    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) 
{ // llama-3 405B + // return 32768; + //} + + return 8192; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -8396,7 +8404,7 @@ struct llm_build_context { } struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.size == n_ctx); @@ -8427,7 +8435,7 @@ struct llm_build_context { } struct ggml_cgraph * build_s_copy() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.recurrent); @@ -8450,7 +8458,7 @@ struct llm_build_context { } struct ggml_cgraph * build_defrag(const std::vector & ids) { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); for (uint32_t i = 0; i < ids.size(); ++i) { const uint32_t id = ids[i]; @@ -8691,7 +8699,7 @@ struct llm_build_context { } struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -8834,7 +8842,7 @@ struct llm_build_context { } struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8949,7 +8957,7 @@ struct llm_build_context { } struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9052,7 +9060,7 @@ struct llm_build_context { } struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9172,7 +9180,7 @@ struct llm_build_context { } struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9329,7 +9337,7 @@ struct llm_build_context { } struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9455,7 +9463,7 @@ struct llm_build_context { } struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9559,7 +9567,7 @@ struct llm_build_context { } struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9653,7 +9661,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9847,7 +9855,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9948,7 +9956,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10238,7 +10246,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10350,7 +10358,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10462,7 +10470,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -10608,7 +10616,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10729,7 +10737,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10961,7 +10969,7 @@ struct llm_build_context { } struct 
ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11066,7 +11074,7 @@ struct llm_build_context { } struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11177,7 +11185,7 @@ struct llm_build_context { } struct ggml_cgraph * build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11295,7 +11303,7 @@ struct llm_build_context { } struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11416,7 +11424,7 @@ struct llm_build_context { // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 // based on the original build_llama() function struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11560,7 +11568,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11668,7 +11676,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11803,7 +11811,7 @@ struct llm_build_context { struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11922,7 +11930,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t d_model = n_embd; const int64_t d_conv = hparams.ssm_d_conv; @@ -12071,7 +12079,7 @@ struct llm_build_context { struct ggml_cgraph * build_command_r() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = 
hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12225,7 +12233,7 @@ struct llm_build_context { // * removed bias // * removed MoE struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12349,7 +12357,7 @@ struct llm_build_context { } struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12474,7 +12482,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12616,7 +12624,7 @@ struct llm_build_context { } struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12748,7 +12756,7 @@ struct llm_build_context { } struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12976,7 +12984,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13116,7 +13124,7 @@ struct llm_build_context { } struct ggml_cgraph * build_t5() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -13433,7 +13441,7 @@ struct llm_build_context { } struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -13525,7 +13533,7 @@ struct llm_build_context { } struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -14870,9 +14878,9 @@ static void 
llama_kv_cache_defrag_internal(struct llama_context & lctx) { // each move requires 6*n_layer tensors (see build_defrag) // - source view, destination view, copy operation // - x2 for keys and values - //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -16762,8 +16770,10 @@ struct llama_context * llama_new_context_with_model( } } + const size_t max_nodes = llama_model_max_nodes(*model); + // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = @@ -16776,7 +16786,7 @@ struct llama_context * llama_new_context_with_model( // currently this is only implemented in the CUDA backend pipeline_parallel = false; #endif - ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); From b5e95468b1676e1e5c9d80d1eeeb26f542a38f42 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 27 Jul 2024 05:03:45 -0700 Subject: [PATCH 024/154] llama : add support for llama 3.1 rope scaling factors (#8676) * Add llama 3.1 rope scaling factors to llama conversion and inference This commit generates the rope factors on conversion and adds them to the resulting model as a tensor. 
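For reference, the factors written by the conversion code below implement the llama3 rope scaling scheme. With per-dimension base frequencies \(\theta_i = \mathrm{base}^{-2i/d}\) and wavelengths \(\lambda_i = 2\pi/\theta_i\), the factor for dimension i works out to (notation mine, as a reading of the code below, not part of the patch): \[ f_i = \begin{cases} 1 & \lambda_i < L_{old}/\beta_{high} \\ s & \lambda_i > L_{old}/\beta_{low} \\ \left(\tfrac{1-\gamma}{s} + \gamma\right)^{-1} & \text{otherwise,} \end{cases} \qquad \gamma = \frac{L_{old}/\lambda_i - \beta_{low}}{\beta_{high} - \beta_{low}} \] where L_old is original_max_position_embeddings, s is the rope_scaling factor, and beta_low/beta_high are low_freq_factor/high_freq_factor. High-frequency dimensions are left unscaled, low-frequency ones are stretched by the full factor s, and the band in between is interpolated smoothly.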
At inference time, these factors are passed to the `ggml_rope_ext` rope operation, improving results for context windows above 8192 tokens * Update convert_hf_to_gguf.py Co-authored-by: compilade * address comments * address comments * Update src/llama.cpp Co-authored-by: compilade * Update convert_hf_to_gguf.py Co-authored-by: compilade --------- Co-authored-by: compilade --- convert_hf_to_gguf.py | 28 ++++++++++++++++++++++++++++ src/llama.cpp | 14 ++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4087187c1..8ba3c5844 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1570,6 +1570,34 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + super().prepare_tensors() if self._experts is not None: diff --git a/src/llama.cpp b/src/llama.cpp index c9cdbb343..0345d0062 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2451,6 +2451,7 @@ struct llama_layer { // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; + struct ggml_tensor * rope_freqs = nullptr; // bitnet scale struct ggml_tensor * wq_scale; @@ -6059,6 +6060,8 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ?
llama_model_loader::TENSOR_DUPLICATED : 0)); + if (n_expert == 0) { layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -8536,6 +8539,10 @@ struct llm_build_context { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { return model.layers[il].rope_long; } @@ -8730,6 +8737,9 @@ struct llm_build_context { // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + // compute Q and K and RoPE them struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -8753,14 +8763,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); From c12b6e8ee7d905e0f299caf311689189fb1b4ac5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 8 Jul 2024 12:03:42 +0200 Subject: [PATCH 025/154] ggml : remove unnecessary UNUSED macro call (ggml/880) This commit removes an UNUSED macro call that is no longer needed, since the variable n0 is used in the code and therefore does not trigger an unused-variable warning.
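As background, UNUSED is the usual cast-to-void idiom for suppressing unused-variable warnings; it is only needed when a variable's sole other uses can compile away. A minimal sketch of the pattern (the macro shape is typical of ggml; the surrounding function is hypothetical):

    #include <assert.h>

    // The cast to void counts as a "use" and generates no code.
    #define UNUSED(x) (void)(x)

    static void check_graph_size(int n_nodes) {
        const int n0 = n_nodes;
        assert(n0 >= 0); // compiled away when NDEBUG is defined
        UNUSED(n0);      // silences the warning in that case; redundant
                         // once n0 is also used by regular code, which is
                         // exactly the situation this commit cleans up
    }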
Signed-off-by: Daniel Bevenius --- ggml/src/ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c196fd5bf..a14d0d1db 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18078,7 +18078,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } const int n0 = cgraph->n_nodes; - UNUSED(n0); ggml_visit_parents(cgraph, tensor); From d2b851bfa131478665315bc5c7c707506c14d703 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 12 Jul 2024 17:24:20 +0300 Subject: [PATCH 026/154] cmake : only enable GGML_NATIVE and x86 flags if not crosscompiling (ggml/885) --- ggml/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index be22a7460..1768a508b 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -50,9 +50,15 @@ else() set(GGML_BLAS_VENDOR_DEFAULT "Generic") endif() +if (CMAKE_CROSSCOMPILING) + set(GGML_NATIVE_DEFAULT OFF) +else() + set(GGML_NATIVE_DEFAULT ON) +endif() + # general option(GGML_STATIC "ggml: static link libraries" OFF) -option(GGML_NATIVE "ggml: enable -march=native flag" ON) +option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) option(GGML_LTO "ggml: enable link time optimization" OFF) option(GGML_CCACHE "ggml: use ccache if available" ON) @@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) # instruction set specific -if (GGML_NATIVE) +if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT) set(INS_ENB OFF) else() set(INS_ENB ON) From 203b7f1531303a060730ec1d1e01920e70302398 Mon Sep 17 00:00:00 2001 From: Tony Wasserka <4840017+neobrain@users.noreply.github.com> Date: Sat, 20 Jul 2024 20:49:44 +0200 Subject: [PATCH 027/154] vulkan : initialize vk_buffer_struct members to VK_NULL_HANDLE (ggml/893) This prevents invalid frees when destroying a partially initialized vk_buffer_struct. For example, this could happen in ggml_vk_create_buffer when running out of device memory. 
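The underlying pattern is worth noting: Vulkan's destroy/free entry points ignore VK_NULL_HANDLE, so zero-initializing handle members makes teardown of a partially constructed object safe. A simplified C++ sketch of the idea (not the actual ggml-vulkan code):

    #include <vulkan/vulkan.hpp>

    struct buffer_raii {
        vk::Device       device;                        // non-owning
        vk::Buffer       buffer        = VK_NULL_HANDLE;
        vk::DeviceMemory device_memory = VK_NULL_HANDLE;

        ~buffer_raii() {
            // Safe even if construction failed between createBuffer and
            // allocateMemory: destroying/freeing a null handle is a no-op.
            if (device) {
                device.destroyBuffer(buffer);
                device.freeMemory(device_memory);
            }
        }
    };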
Co-authored-by: Tony Wasserka --- ggml/src/ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 74991f6d1..fa68360b9 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -236,8 +236,8 @@ struct vk_device_struct { }; struct vk_buffer_struct { - vk::Buffer buffer; - vk::DeviceMemory device_memory; + vk::Buffer buffer = VK_NULL_HANDLE; + vk::DeviceMemory device_memory = VK_NULL_HANDLE; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; From 9f77d899b7b0d56496f679e54b797da6199fed8e Mon Sep 17 00:00:00 2001 From: Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:32:02 +0300 Subject: [PATCH 028/154] ggml: add support for float16 input tensors in pooling operations (ggml/895) * Add support for float16 tensors in 1d pooling operations * Add support for float16 input tensors in 2d pooling operations * code cleanup remove unnecessary casting during srow ptr initialization --------- Co-authored-by: vanaka11 --- ggml/src/ggml.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a14d0d1db..c76d00a39 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14746,7 +14746,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_tensor * src = dst->src[0]; - assert(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14759,10 +14759,8 @@ static void ggml_compute_forward_pool_1d_sk_p0( const int64_t rs = dst->ne[0]; while (cdata < data_end) { - const float * const srow = (const float *)cdata; - + const void * srow = (const void *)cdata; int j = 0; - for (int64_t i = 0; i < rs; ++i) { switch (op) { case GGML_OP_POOL_AVG: drow[i] = 0; break; @@ -14770,10 +14768,11 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + case GGML_OP_POOL_AVG: drow[i] += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } ++j; } @@ -14814,7 +14813,7 @@ static void ggml_compute_forward_pool_2d( const struct ggml_tensor * src = dst->src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14857,14 +14856,15 @@ static void ggml_compute_forward_pool_2d( for (int ky = 0; ky < k1; ++ky) { if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; + const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: *out += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + case GGML_OP_POOL_AVG: *out += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } From a05ca9369716a8319014cd1fc365980d43f8aae9 Mon Sep 17 00:00:00 2001 From: Mahesh Madhav <67384846+heshpdx@users.noreply.github.com> Date: Thu, 25 Jul 2024 00:54:08 -0700 Subject: [PATCH 029/154] ggml : loop tiling optimizations for scalar path (ggml/898) Apply a loop tiling technique to the generic path, which provides performance upside for ISAs with enough registers to take advantage of it. Also helps the compiler optimize this path. --- ggml/src/ggml-quants.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1c6c85aac..aa936fe5f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8; - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); } @@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0 + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F); const int v1 = (x[ib].qs[j] >> 4); - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } @@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; } @@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; @@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void 
* r const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } From ae7985cd7beca3b849328d169a8d592469cd021f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 15:53:48 +0300 Subject: [PATCH 030/154] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 80159b70b..998b23ac6 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -e3b3846976c94163f2b3dd128cc959782653edbb +31d544f87835a55602883fe09156bb85a4c163d8 From 345c8c0c87a97c1595f9c8b14833d531c8c7d8df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 15:57:09 +0300 Subject: [PATCH 031/154] ggml : add missing semicolon (#0) ggml-ci --- ggml/src/ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index aa936fe5f..9016314f5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4477,7 +4477,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi0 = 0 + int sumi0 = 0; int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { From 56f20aa25d5f97248a204b473c99f4040900f0e5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 17:19:35 +0300 Subject: [PATCH 032/154] scripts : sync ggml-aarch64 sources --- scripts/sync-ggml-am.sh | 4 ++++ scripts/sync-ggml.sh | 2 ++ 2 files changed, 6 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index ba3bedf21..f624e4881 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -102,6 +102,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake # # src/ggml.c -> ggml/src/ggml.c + # src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c + # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend.c -> ggml/src/ggml-backend.c @@ -143,6 +145,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 402446ef9..af3784c3d 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -5,6 +5,8 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c +cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c +cp -rpv ../ggml/src/ggml-aarch64.h 
./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c From 5e2727fe0321c38d1664d26173c654fa1801dc5f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 18:08:31 +0300 Subject: [PATCH 033/154] scripts : sync vulkan-shaders (#0) --- scripts/sync-ggml-am.sh | 2 ++ scripts/sync-ggml.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index f624e4881..c40025356 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -119,6 +119,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-sycl/* -> ggml/src/ggml-sycl/ # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp + # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/ # # include/ggml.h -> ggml/include/ggml.h # include/ggml-alloc.h -> ggml/include/ggml-alloc.h @@ -162,6 +163,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index af3784c3d..d6d7d0a60 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -23,6 +23,7 @@ cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp +cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h From e54c35e4fb5777c76316a50671640e6e144c9538 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Sun, 28 Jul 2024 07:41:25 +0800 Subject: [PATCH 034/154] feat: Support Moore Threads GPU (#8383) * Update doc for MUSA Signed-off-by: Xiaodong Ye * Add GGML_MUSA in Makefile Signed-off-by: Xiaodong Ye * Add GGML_MUSA in CMake Signed-off-by: Xiaodong Ye * CUDA => MUSA Signed-off-by: Xiaodong Ye * MUSA adds support for __vsubss4 Signed-off-by: Xiaodong Ye * Fix CI build failure Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye --- Makefile | 57 ++++++++-- README.md | 1 + docs/build.md | 13 +++ ggml/CMakeLists.txt | 1 + ggml/include/ggml-cuda.h | 3 + ggml/src/CMakeLists.txt | 62 +++++++++-- ggml/src/ggml-common.h | 6 +- ggml/src/ggml-cuda.cu | 22 ++-- ggml/src/ggml-cuda/common.cuh | 194 +++++++++++++++++++++++++++++++++- 9 files changed, 329 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index 7e015af3e..c82f4268a 100644 --- a/Makefile +++ b/Makefile @@ -528,10 +528,21 @@ ifndef GGML_NO_ACCELERATE endif endif # GGML_NO_ACCELERATE +ifdef GGML_MUSA + CC := clang + CXX := clang++ + GGML_CUDA := 1 + MK_CPPFLAGS += -DGGML_USE_MUSA +endif + ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp + ifdef GGML_MUSA + MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp + 
MK_LDFLAGS += -L/usr/lib/llvm-10/lib + endif # GGML_MUSA endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS @@ -582,15 +593,27 @@ else endif # GGML_CUDA_FA_ALL_QUANTS ifdef GGML_CUDA - ifneq ('', '$(wildcard /opt/cuda)') - CUDA_PATH ?= /opt/cuda - else - CUDA_PATH ?= /usr/local/cuda - endif + ifdef GGML_MUSA + ifneq ('', '$(wildcard /opt/musa)') + CUDA_PATH ?= /opt/musa + else + CUDA_PATH ?= /usr/local/musa + endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - MK_NVCCFLAGS += -use_fast_math + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include + MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64 + MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22 + else + ifneq ('', '$(wildcard /opt/cuda)') + CUDA_PATH ?= /opt/cuda + else + CUDA_PATH ?= /usr/local/cuda + endif + + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib + MK_NVCCFLAGS += -use_fast_math + endif # GGML_MUSA OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) @@ -600,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS +ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT +endif # GGML_MUSA ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo @@ -615,8 +640,12 @@ endif # GGML_CUDA_DEBUG ifdef GGML_CUDA_NVCC NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else - NVCC = $(CCACHE) nvcc -endif #GGML_CUDA_NVCC + ifdef GGML_MUSA + NVCC = $(CCACHE) mcc + else + NVCC = $(CCACHE) nvcc + endif # GGML_MUSA +endif # GGML_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -687,9 +716,15 @@ define NVCC_COMPILE $(NVCC) -I. 
-Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE else + ifdef GGML_MUSA +define NVCC_COMPILE + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@ +endef # NVCC_COMPILE + else define NVCC_COMPILE $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE + endif # GGML_MUSA endif # JETSON_EOL_MODULE_DETECT ggml/src/ggml-cuda/%.o: \ @@ -944,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef GGML_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifndef GGML_MUSA ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH @@ -953,6 +989,7 @@ endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) +endif # GGML_MUSA endif # GGML_CUDA $(info ) diff --git a/README.md b/README.md index d0ae2efb9..775ce2c88 100644 --- a/README.md +++ b/README.md @@ -409,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [BLAS](./docs/build.md#blas-build) | All | | [BLIS](./docs/backend/BLIS.md) | All | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [Vulkan](./docs/build.md#vulkan) | GPU | diff --git a/docs/build.md b/docs/build.md index d9d12c467..cfe42ebbf 100644 --- a/docs/build.md +++ b/docs/build.md @@ -192,6 +192,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | +### MUSA + +- Using `make`: + ```bash + make GGML_MUSA=1 + ``` +- Using `CMake`: + + ```bash + cmake -B build -DGGML_MUSA=ON + cmake --build build --config Release + ``` + ### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 1768a508b..a5c2e96a8 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -113,6 +113,7 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) +option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index d7903c666..71bb6dcf0 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -6,6 +6,9 @@ #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" #define GGML_CUBLAS_NAME "hipBLAS" +#elif defined(GGML_USE_MUSA) +#define GGML_CUDA_NAME "MUSA" +#define GGML_CUBLAS_NAME "muBLAS" #else #define GGML_CUDA_NAME "CUDA" #define GGML_CUBLAS_NAME "cuBLAS" diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c6496c921..836496fb9 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -139,6 +139,17 @@ if (GGML_METAL) ) endif() +if (GGML_MUSA) + set(CMAKE_C_COMPILER clang) + set(CMAKE_C_EXTENSIONS OFF) + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_CXX_EXTENSIONS OFF) + + set(GGML_CUDA ON) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA) +endif() + if (GGML_OPENMP) find_package(OpenMP) if (OpenMP_FOUND) @@ -147,6 +158,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + + if (GGML_MUSA) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so") + endif() else() message(WARNING "OpenMP not found") endif() @@ -249,7 +265,13 @@ endif() if (GGML_CUDA) cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - find_package(CUDAToolkit) + if (GGML_MUSA) + list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/") + find_package(MUSAToolkit) + set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND}) + else() + find_package(CUDAToolkit) + endif() if (CUDAToolkit_FOUND) message(STATUS "CUDA found") @@ -268,7 +290,11 @@ if (GGML_CUDA) endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - enable_language(CUDA) + if (GGML_MUSA) + set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE}) + else() + enable_language(CUDA) + endif() file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh") list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h") @@ -332,21 +358,40 @@ if (GGML_CUDA) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() + if (GGML_MUSA) + set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX) + foreach(SOURCE ${GGML_SOURCES_CUDA}) + set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22") + endforeach() + endif() + if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() endif() else() - 
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() endif() if (GGML_CUDA_NO_VMM) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() endif() else() message(WARNING "CUDA not found") @@ -857,8 +902,10 @@ function(get_flags CCID CCVER) set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) + if (NOT GGML_MUSA) + if (CCVER VERSION_GREATER_EQUAL 7.1.0) + list(APPEND CXX_FLAGS -Wno-format-truncation) + endif() endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) list(APPEND CXX_FLAGS -Wextra-semi) @@ -1264,6 +1311,7 @@ endif() target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) +target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index fafd5fa7a..e40057632 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -19,7 +19,11 @@ typedef half2 ggml_half2; #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) +#if defined(GGML_COMMON_DECL_MUSA) +#include +#else #include +#endif #include typedef half ggml_half; @@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_TABLE_END() }; #define GGML_COMMON_IMPL -#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) +#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA) #include #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 54ccf6bb1..c73ae40d4 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() { for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -179,7 +179,7 @@ static ggml_cuda_device_info ggml_cuda_init() { alloc_prop.location.id = id; CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); } -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) info.devices[id].vmm = !!device_vmm; cudaDeviceProp prop; @@ -315,7 +315,7 @@ struct ggml_cuda_pool_leg : 
public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); } }; -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr(new ggml_cuda_pool_vmm(device)); } -#endif +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) return std::unique_ptr(new ggml_cuda_pool_leg(device)); } @@ -1341,7 +1341,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static cudaError_t ggml_cuda_Memcpy2DPeerAsync( void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices cudaMemcpy3DPeerParms p = {}; p.dstDevice = dstDevice; @@ -1355,7 +1355,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync( GGML_UNUSED(dstDevice); GGML_UNUSED(srcDevice); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) } static void ggml_cuda_op_mul_mat( @@ -1828,6 +1828,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co } } #else +#ifdef GGML_USE_MUSA + GGML_ASSERT(false); +#else // !GGML_USE_MUSA if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx @@ -1870,6 +1873,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } +#endif // GGML_USE_MUSA #endif if (dst->op_params[0] == GGML_PREC_DEFAULT) { @@ -3027,7 +3031,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size return false; } -#if CUDART_VERSION >= 11100 +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); if (err != cudaSuccess) { // clear the error diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index eac026f47..8c3c20b90 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -12,6 +12,10 @@ #else #define GGML_COMMON_DECL_CUDA #define GGML_COMMON_IMPL_CUDA +#if defined(GGML_USE_MUSA) +#define GGML_COMMON_DECL_MUSA +#define GGML_COMMON_IMPL_MUSA +#endif #endif #include "ggml-common.h" @@ -114,6 +118,150 @@ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#elif 
defined(GGML_USE_MUSA) +#include +#include +#include +#include +// XXX: Keep the following order the same as hipBLAS +// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F +// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F +#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F +#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N MUBLAS_OP_N +#define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS +// #define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F MUSA_R_16F +#define CUDA_R_32F MUSA_R_32F +// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +// #define cublasComputeType_t mublasComputeType_t +#define cublasCreate mublasCreate +#define cublasDestroy mublasDestroy +#define cublasGemmEx mublasGemmEx +#define cublasGemmBatchedEx mublasGemmBatchedEx +#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx +#define cublasHandle_t mublasHandle_t +// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetMathMode mublasSetMathMode +#define cublasSetStream mublasSetStream +#define cublasSgemm mublasSgemm +#define cublasStatus_t mublasStatus_t +#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess +#define cudaDeviceProp musaDeviceProp +#define cudaDeviceSynchronize musaDeviceSynchronize +#define cudaError_t musaError_t +#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags musaEventCreateWithFlags +#define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventRecord musaEventRecord +#define cudaEventSynchronize musaEventSynchronize +#define cudaEvent_t musaEvent_t +#define cudaEventDestroy musaEventDestroy +#define cudaFree musaFree +#define cudaFreeHost musaFreeHost +#define cudaGetDevice musaGetDevice +#define cudaGetDeviceCount musaGetDeviceCount +#define cudaGetDeviceProperties musaGetDeviceProperties +#define cudaGetErrorString musaGetErrorString +#define cudaGetLastError musaGetLastError +#define cudaHostRegister musaHostRegister +#define cudaHostRegisterPortable musaHostRegisterPortable +#define cudaHostRegisterReadOnly musaHostRegisterReadOnly +#define cudaHostUnregister musaHostUnregister +#define cudaLaunchHostFunc musaLaunchHostFunc +#define cudaMalloc musaMalloc +#define cudaMallocHost musaMallocHost +#define cudaMemcpy musaMemcpy +#define cudaMemcpyAsync musaMemcpyAsync +#define cudaMemcpyPeerAsync musaMemcpyPeerAsync +#define cudaMemcpy2DAsync musaMemcpy2DAsync +#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost +#define cudaMemcpyHostToDevice musaMemcpyHostToDevice +#define cudaMemcpyKind musaMemcpyKind +#define cudaMemset musaMemset +#define cudaMemsetAsync musaMemsetAsync +#define cudaMemGetInfo musaMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize +#define cudaSetDevice musaSetDevice +#define cudaStreamCreateWithFlags musaStreamCreateWithFlags +#define cudaStreamDestroy musaStreamDestroy +#define cudaStreamFireAndForget musaStreamFireAndForget +#define cudaStreamNonBlocking musaStreamNonBlocking +#define cudaStreamPerThread musaStreamPerThread +#define 
cudaStreamSynchronize musaStreamSynchronize +#define cudaStreamWaitEvent musaStreamWaitEvent +#define cudaStream_t musaStream_t +#define cudaSuccess musaSuccess + +// XXX: Other CUDA => MUSA mapping +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED +#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED +#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE +#define CUdevice MUdevice +#define CUdeviceptr MUdeviceptr +#define CUmemAccessDesc MUmemAccessDesc +#define CUmemAllocationProp MUmemAllocationProp +#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle +#define cuDeviceGet muDeviceGet +#define cuDeviceGetAttribute muDeviceGetAttribute +#define cuMemAddressFree muMemAddressFree +#define cuMemAddressReserve muMemAddressReserve +#define cuMemCreate muMemCreate +#define cuMemGetAllocationGranularity muMemGetAllocationGranularity +#define cuMemMap muMemMap +#define cuMemRelease muMemRelease +#define cuMemSetAccess muMemSetAccess +#define cuMemUnmap muMemUnmap +#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize +#define cudaFuncSetAttribute musaFuncSetAttribute +#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms +#define make_cudaExtent make_musaExtent +#define make_cudaPitchedPtr make_musaPitchedPtr + +// XXX: USE_CUDA_GRAPH +#define CUDA_SUCCESS MUSA_SUCCESS +#define CUresult MUresult +#define cuGetErrorString muGetErrorString +#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure +#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction +#define cudaGraphDestroy musaGraphDestroy +#define cudaGraphExecDestroy musaGraphExecDestroy +#define cudaGraphExec_t musaGraphExec_t +#define cudaGraphExecUpdate musaGraphExecUpdate +#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult +#define cudaGraphGetNodes musaGraphGetNodes +#define cudaGraphInstantiate musaGraphInstantiate +#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams +#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams +#define cudaGraphLaunch musaGraphLaunch +#define cudaGraphNodeGetType musaGraphNodeGetType +#define cudaGraphNode_t musaGraphNode_t +#define cudaGraphNodeType musaGraphNodeType +#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel +#define cudaGraph_t musaGraph_t +#define cudaKernelNodeParams musaKernelNodeParams +#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed +#define cudaStreamEndCapture musaStreamEndCapture + +// XXX: cuBLAS => muBLAS mapping +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t + +// XXX: Clang builtins mapping +#define __vsub4 __vsub4_musa +#define __vcmpeq4 __vcmpeq4_musa +#define __vcmpne4 __vcmpne4_musa #else #include #include @@ -168,9 +316,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) -#if CUDART_VERSION >= 12000 +#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { +#ifndef GGML_USE_MUSA return cublasGetStatusString(err); +#else + return mublasStatus_to_string(err); +#endif // 
GGML_USE_MUSA } #else static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -200,7 +352,7 @@ static const char * cu_get_error_str(CUresult err) { #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) #endif -#if CUDART_VERSION >= 11100 +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) #define GGML_CUDA_ASSUME(x) __builtin_assume(x) #else #define GGML_CUDA_ASSUME(x) @@ -214,6 +366,42 @@ typedef float dfloat; // dequantize float typedef float2 dfloat2; #endif //GGML_CUDA_F16 +#if defined(GGML_USE_MUSA) +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); + +static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } + return c; +} +#endif // defined(GGML_USE_MUSA) + #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -455,7 +643,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); return mask_low | mask_high; } -#endif // CUDART_VERSION < 12000 +#endif // CUDART_VERSION < CUDART_HMASK static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) From 4c676c85e59ef8f771f3a129e6eb217552139231 Mon Sep 17 00:00:00 2001 From: compilade Date: Sun, 28 Jul 2024 00:42:05 -0400 Subject: [PATCH 035/154] llama : refactor session file management (#8699) * llama : refactor session file management * llama : saving and restoring state checks for overflow The size of the buffers should now be given to the functions working with them, otherwise a truncated file could cause out of bound reads. * llama : stream from session file instead of copying into a big buffer Loading session files should no longer cause a memory usage spike. * llama : llama_state_get_size returns the actual size instead of max This is a breaking change, but makes that function *much* easier to keep up to date, and it also makes it reflect the behavior of llama_state_seq_get_size. * llama : share code between whole and seq_id-specific state saving Both session file types now use a more similar format. * llama : no longer store all hparams in session files Instead, the model arch name is stored. The layer count and the embedding dimensions of the KV cache are still verified when loading. Storing all the hparams is not necessary. * llama : fix uint64_t format type * llama : various integer type cast and format string fixes Some platforms use "%lu" and others "%llu" for uint64_t. Not sure how to handle that, so casting to size_t when displaying errors. 
* llama : remove _context suffix for llama_data_context * llama : fix session file loading llama_state_get_size cannot be used to get the max size anymore. * llama : more graceful error handling of invalid session files * llama : remove LLAMA_MAX_RNG_STATE It's no longer necessary to limit the size of the RNG state, because the max size of session files is not estimated anymore. * llama : cast seq_id in comparison with unsigned n_seq_max --- examples/save-load-state/save-load-state.cpp | 20 +- include/llama.h | 23 +- src/llama.cpp | 1524 +++++++++--------- 3 files changed, 749 insertions(+), 818 deletions(-) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 00c2277ac..d8afdc141 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -47,7 +47,7 @@ int main(int argc, char ** argv) { // save state (rng, logits, embedding and kv_cache) to file { std::vector state_mem(llama_state_get_size(ctx)); - const size_t written = llama_state_get_data(ctx, state_mem.data()); + const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size()); FILE *fp_write = fopen("dump_state.bin", "wb"); fwrite(state_mem.data(), 1, written, fp_write); @@ -99,13 +99,16 @@ int main(int argc, char ** argv) { // load state (rng, logits, embedding and kv_cache) from file { - std::vector state_mem(llama_state_get_size(ctx2)); + std::vector state_mem; FILE * fp_read = fopen("dump_state.bin", "rb"); + fseek(fp_read, 0, SEEK_END); + state_mem.resize(ftell(fp_read)); + fseek(fp_read, 0, SEEK_SET); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); fclose(fp_read); - if (read != llama_state_set_data(ctx2, state_mem.data())) { + if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); @@ -159,13 +162,16 @@ int main(int argc, char ** argv) { // load state (rng, logits, embedding and kv_cache) from file { - std::vector state_mem(llama_state_get_size(ctx3)); + std::vector state_mem; FILE * fp_read = fopen("dump_state.bin", "rb"); + fseek(fp_read, 0, SEEK_END); + state_mem.resize(ftell(fp_read)); + fseek(fp_read, 0, SEEK_SET); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); fclose(fp_read); - if (read != llama_state_set_data(ctx3, state_mem.data())) { + if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx3); llama_free_model(model); @@ -182,7 +188,7 @@ int main(int argc, char ** argv) { { // save kv of seq 0 std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0); + const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); if (ncopy != seq_store.size()) { fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); llama_free(ctx3); @@ -196,7 +202,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1); + const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); if (nset != seq_store.size()) { fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, 
seq_store.size()); llama_free(ctx3); diff --git a/include/llama.h b/include/llama.h index 413070d95..f23355a6b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -33,17 +33,15 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF -#define LLAMA_MAX_RNG_STATE (64*1024) - #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 7 +#define LLAMA_SESSION_VERSION 8 #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ -#define LLAMA_STATE_SEQ_VERSION 1 +#define LLAMA_STATE_SEQ_VERSION 2 #ifdef __cplusplus extern "C" { @@ -691,10 +689,11 @@ extern "C" { // State / sessions // - // Returns the maximum size in bytes of the state (rng, logits, embedding - // and kv_cache) - will often be smaller after compacting tokens - LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx); - LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx), + // Returns the *actual* size in bytes of the state + // (rng, logits, embedding and kv_cache) + // Only use when saving the state, not when restoring it, otherwise the size may be too small. + LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); + LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), "use llama_state_get_size instead"); // Copies the state to the specified destination address. @@ -702,7 +701,8 @@ extern "C" { // Returns the number of bytes copied LLAMA_API size_t llama_state_get_data( struct llama_context * ctx, - uint8_t * dst); + uint8_t * dst, + size_t size); LLAMA_API DEPRECATED(size_t llama_copy_state_data( struct llama_context * ctx, uint8_t * dst), @@ -712,7 +712,8 @@ extern "C" { // Returns the number of bytes read LLAMA_API size_t llama_state_set_data( struct llama_context * ctx, - const uint8_t * src); + const uint8_t * src, + size_t size); LLAMA_API DEPRECATED(size_t llama_set_state_data( struct llama_context * ctx, const uint8_t * src), @@ -754,6 +755,7 @@ extern "C" { LLAMA_API size_t llama_state_seq_get_data( struct llama_context * ctx, uint8_t * dst, + size_t size, llama_seq_id seq_id); // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence @@ -763,6 +765,7 @@ extern "C" { LLAMA_API size_t llama_state_seq_set_data( struct llama_context * ctx, const uint8_t * src, + size_t size, llama_seq_id dest_seq_id); LLAMA_API size_t llama_state_seq_save_file( diff --git a/src/llama.cpp b/src/llama.cpp index 0345d0062..a207451f5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2933,7 +2933,7 @@ static bool llama_kv_cache_init( // TODO: find a nicer way to add other recurrent model architectures cache.recurrent = model.arch == LLM_ARCH_MAMBA; - cache.v_trans = !cparams.flash_attn; + cache.v_trans = !cache.recurrent && !cparams.flash_attn; cache.head = 0; cache.size = kv_size; @@ -17303,18 +17303,18 @@ void llama_kv_cache_update(struct llama_context * ctx) { } // deprecated -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t llama_get_state_size(struct llama_context * ctx) { return llama_state_get_size(ctx); } // deprecated size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - return llama_state_get_data(ctx, dst); + return llama_state_get_data(ctx, dst, -1); } // deprecated size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - return llama_state_set_data(ctx, src); + return 
llama_state_set_data(ctx, src, -1); } // deprecated @@ -17327,603 +17327,205 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi return llama_state_save_file(ctx, path_session, tokens, n_token_count); } -// Returns the *maximum* size of the state -size_t llama_state_get_size(const struct llama_context * ctx) { - const auto & cparams = ctx->cparams; - const auto & hparams = ctx->model.hparams; - - // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. - // for reference, std::mt19937(1337) serializes to 6701 bytes. - const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_n_outputs = sizeof(size_t); - // assume worst case for outputs although only currently set ones are serialized - const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t); - const size_t s_logits_size = sizeof(size_t); - const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0; - const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0; - const size_t s_kv_buf_size = sizeof(size_t); - const size_t s_kv_head = sizeof(uint32_t); - const size_t s_kv_size = sizeof(uint32_t); - const size_t s_kv_used = sizeof(uint32_t); - const size_t s_v_trans = sizeof(uint32_t); - const size_t s_kv = ctx->kv_self.total_size(); - const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id); - const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; - - const size_t s_total = ( - + s_rng_size - + s_rng - + s_n_outputs - + s_output_pos - + s_logits_size - + s_logits - + s_embedding_size - + s_embedding - + s_kv_buf_size - + s_kv_head - + s_kv_size - + s_kv_used - + s_v_trans - + s_kv - + s_kv_cells - ); - - // on session change it is very likely that the state size has changed - so we need to update this function - static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. 
But did you remember to update llama_state_get_size?"); - - return s_total; -} - -// llama_context_data -struct llama_data_context { +// TODO: replace all non-fatal assertions with returned errors or exceptions +struct llama_data_write { virtual void write(const void * src, size_t size) = 0; virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; + virtual ~llama_data_write() = default; -struct llama_data_buffer_context : llama_data_context { - uint8_t * ptr; - size_t size_written = 0; + void write_string(const std::string & str) { + uint32_t str_size = str.size(); - llama_data_buffer_context(uint8_t * p) : ptr(p) {} - - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); } - size_t get_size_written() override { - return size_written; - } -}; - -struct llama_data_file_context : llama_data_context { - llama_file * file; - size_t size_written = 0; - - llama_data_file_context(llama_file * f) : file(f) {} - - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; + void write_model_info(const struct llama_context * ctx) { + std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical } - size_t get_size_written() override { - return size_written; - } -}; - -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_file_context data_ctx(&file); - * llama_state_get_data(ctx, &data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_buffer_context data_ctx(&buf.data()); - * llama_state_get_data(ctx, &data_ctx); - * -*/ -static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { - llama_synchronize(ctx); - - // copy rng - { + void write_rng(const std::mt19937 & rng) { std::ostringstream rng_ss; - rng_ss << ctx->sampling.rng; + rng_ss << rng; - const std::string & rng_str = rng_ss.str(); - const size_t rng_size = rng_str.size(); + const std::string & rng_str = rng_ss.str(); - GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(rng_str.data(), rng_size); + write_string(rng_str); } - // copy outputs - { - // Can't use ctx->n_outputs because it's not for the - // entire last batch when n_ubatch is smaller than n_batch - size_t n_outputs = 0; + void write_output_ids(const struct llama_context * ctx) { + const uint32_t n_outputs = ctx->n_outputs; - // copy output ids - { - std::vector output_pos; - - const size_t n_batch = ctx->cparams.n_batch; - const auto & output_ids = ctx->output_ids; - - output_pos.resize(ctx->output_size); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - if ((size_t) pos >= n_outputs) { - n_outputs = pos + 1; - } - GGML_ASSERT((size_t) pos < ctx->output_size); - output_pos[pos] = i; - } - } - - data_ctx->write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - // copy logits - { - const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab); - - 
data_ctx->write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - data_ctx->write(ctx->logits, logits_size * sizeof(float)); - } - } - - // copy embeddings - { - const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd); - - data_ctx->write(&embeddings_size, sizeof(embeddings_size)); - - if (embeddings_size) { - data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); - } - } - } - - // copy kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - // NOTE: kv_size and kv_buf_size are mostly used for sanity checks - const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); - const uint32_t kv_size = kv_self.size; - const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head; - const uint32_t kv_used = kv_self.used; - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; - - data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); - data_ctx->write(&kv_head, sizeof(kv_head)); - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_used, sizeof(kv_used)); - data_ctx->write(&v_trans, sizeof(v_trans)); - - if (kv_buf_size) { - const size_t pre_kv_buf_size = data_ctx->get_size_written(); - - std::vector tmp_buf; - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); - - tmp_buf.resize(k_size); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); - - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); - - tmp_buf.resize(v_size); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); - continue; - } - - // v is not contiguous, copy row by row - const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); - - tmp_buf.resize(v_row_size); - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); - } - } - GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size); - } - - for (uint32_t i = 0; i < kv_head; ++i) { - const auto & cell = kv_self.cells[i]; - - const llama_pos pos = cell.pos; - const size_t seq_id_size = cell.seq_id.size(); - - data_ctx->write(&pos, sizeof(pos)); - data_ctx->write(&seq_id_size, sizeof(seq_id_size)); - - for (auto seq_id : cell.seq_id) { - data_ctx->write(&seq_id, sizeof(seq_id)); - } - } - } -} - -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) { - llama_data_buffer_context data_ctx(dst); - llama_state_get_data_internal(ctx, &data_ctx); - - return data_ctx.get_size_written(); -} - -// Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { - llama_synchronize(ctx); - - const uint8_t * inp = src; - - // set rng - { - size_t rng_size; - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - - 
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - - std::string rng_str((const char *)inp, rng_size); inp += rng_size; - - std::istringstream rng_ss(rng_str); - rng_ss >> ctx->sampling.rng; - - GGML_ASSERT(!rng_ss.fail()); - } - - // set output ids - { - size_t n_outputs; std::vector output_pos; - memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs); + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; - GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs)); + GGML_ASSERT(n_outputs <= ctx->output_size); + + output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + output_pos[pos] = i; + } + } + + write(&n_outputs, sizeof(n_outputs)); if (n_outputs) { - output_pos.resize(n_outputs); - memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t)); - inp += n_outputs * sizeof(int32_t); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); - ctx->output_ids[id] = i; - } - - ctx->n_outputs = n_outputs; + write(output_pos.data(), n_outputs * sizeof(int32_t)); } } - // set logits - { - size_t logits_size; + void write_logits(const struct llama_context * ctx) { + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab); - memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - - GGML_ASSERT(ctx->logits_size >= logits_size); + write(&logits_size, sizeof(logits_size)); if (logits_size) { - memcpy(ctx->logits, inp, logits_size * sizeof(float)); - inp += logits_size * sizeof(float); + write(ctx->logits, logits_size * sizeof(float)); } } - // set embeddings - { - size_t embeddings_size; + void write_embeddings(const struct llama_context * ctx) { + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); - memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - - GGML_ASSERT(ctx->embd_size >= embeddings_size); + write(&embeddings_size, sizeof(embeddings_size)); if (embeddings_size) { - memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); - inp += embeddings_size * sizeof(float); + write(ctx->embd, embeddings_size * sizeof(float)); } } - // set kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; + void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { - const uint32_t n_layer = hparams.n_layer; + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = kv_self.cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? 
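// [editor's note] n_seq_id convention: for a single-sequence save
// (seq_id != -1) a count of 0 is written and the reader assigns its own
// destination sequence id; the per-cell seq_id sets are only serialized for
// whole-cache snapshots (seq_id == -1). read_kv_cache_meta further down
// enforces exactly this (it rejects n_seq_id != 0 when restoring into one
// sequence).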
cell.seq_id.size() : 0; - size_t kv_buf_size; - uint32_t kv_head; - uint32_t kv_size; - uint32_t kv_used; - uint32_t v_trans; + write(&pos, sizeof(pos)); + write(&n_seq_id, sizeof(n_seq_id)); - memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); - memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); - memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans); + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + write(&seq_id, sizeof(seq_id)); + } + } + } + } + } - GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition + void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + const struct llama_hparams & hparams = ctx->model.hparams; - if (kv_self.size != kv_size) { - // the KV cache needs to be big enough to load all the KV cells from the saved state - GGML_ASSERT(kv_self.size >= kv_head); + const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; - LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n", - __func__, kv_head, kv_size, kv_self.size); + write(&v_trans, sizeof(v_trans)); + write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * k_size_row); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); + write(tmp_buf.data(), tmp_buf.size()); + } } - llama_kv_cache_clear(ctx); - - if (kv_buf_size) { - const size_t pre_kv_buf_size = inp - src; - - GGML_ASSERT(kv_self.total_size() >= kv_buf_size); - - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); - inp += k_size; + // Write row size of value + const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + write(&v_size_row, sizeof(v_size_row)); - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); - - ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size); - inp += v_size; - continue; - } - - // v is not 
contiguous, copy row by row - const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size); - - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); - inp += v_row_size; + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * v_size_row); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); + write(tmp_buf.data(), tmp_buf.size()); } } - GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size); - } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = kv_self.size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - ctx->kv_self.head = kv_head; - ctx->kv_self.used = kv_used; + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - for (uint32_t i = 0; i < kv_head; ++i) { - llama_pos pos; - size_t seq_id_size; + // Write element size + const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + write(&v_size_el, sizeof(v_size_el)); - memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos); - memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size); + // Write GQA embedding size + write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - ctx->kv_self.cells[i].pos = pos; - - llama_seq_id seq_id; - - for (size_t j = 0; j < seq_id_size; ++j) { - memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id); - ctx->kv_self.cells[i].seq_id.insert(seq_id); + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + tmp_buf.resize(range_size * v_size_el); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); + write(tmp_buf.data(), tmp_buf.size()); + } + } } } } - const size_t nread = inp - src; - const size_t max_size = llama_state_get_size(ctx); + void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; - GGML_ASSERT(nread <= max_size); - - return nread; -} - -static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); - - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); - - if (session_hparams != ctx->model.hparams) { - LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", 
__func__); - return false; - } - } - - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } - - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } - - // restore the context state - { - const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_state_get_size(ctx); - - if (n_state_size_cur > n_state_size_max) { - LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); - return false; - } - - std::vector state_data(n_state_size_max); - file.read_raw(state_data.data(), n_state_size_cur); - - llama_state_set_data(ctx, state_data.data()); - } - - return true; -} - -bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - try { - return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); - return false; - } -} - -static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); - - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); - - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_file_context data_ctx(&file); - llama_state_get_data_internal(ctx, &data_ctx); - - return true; -} - -bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - try { - return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error saving session file: %s\n", err.what()); - return false; - } -} - -size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) { - // save the size of size_t as a uint32_t for safety check - const size_t size_t_size_size = sizeof(uint32_t); - - // other values - const size_t s_cell_count_size = sizeof(uint32_t); - const size_t s_layer_count_size = sizeof(uint32_t); - const size_t n_embd_v_gqa_size = sizeof(uint32_t); - - size_t s_cell_count = 0; - size_t s_cell_data_size = 0; - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; - if (cell.seq_id.count(seq_id) > 0) { - ++s_cell_count; - s_cell_data_size += sizeof(llama_pos); - } - } - - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // types of keys and values - s_cell_data_size += sizeof(int32_t) * 2; - // k_size_row and v_size_el values of layer - s_cell_data_size += sizeof(size_t) * 2; - - // keys - const size_t k_size_row = 
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - s_cell_data_size += k_size_row * s_cell_count; - - // values (transposed) - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa; - } - - const size_t s_total = ( - size_t_size_size + - s_cell_count_size + - s_layer_count_size + - n_embd_v_gqa_size + - s_cell_data_size - ); - - return s_total; -} - -static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); - - const auto & kv_self = ctx->kv_self; - GGML_ASSERT(!kv_self.recurrent); // not implemented - - // Save the size of size_t as a uint32_t for safety check - const uint32_t size_t_size = sizeof(size_t); - data_ctx.write(&size_t_size, sizeof(size_t_size)); - - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id - { + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) uint32_t cell_range_begin = kv_self.size; for (uint32_t i = 0; i < kv_self.size; ++i) { const auto & cell = kv_self.cells[i]; - if (cell.has_seq_id(seq_id)) { + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; if (cell_range_begin == kv_self.size) { cell_range_begin = i; } - } - else { + } else { if (cell_range_begin != kv_self.size) { cell_ranges.emplace_back(cell_range_begin, i); cell_range_begin = kv_self.size; @@ -17940,301 +17542,622 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam cell_count_check += range.second - range.first; } GGML_ASSERT(cell_count == cell_count_check); + + write(&cell_count, sizeof(cell_count)); + + write_kv_cache_meta(kv_self, cell_ranges, seq_id); + write_kv_cache_data(ctx, cell_ranges); + } +}; + +struct llama_data_read { + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + virtual size_t get_size_read() = 0; + virtual ~llama_data_read() = default; + + void read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); + + str.assign((const char *) read(str_size), str_size); } - // Write the cell count - data_ctx.write(&cell_count, sizeof(cell_count)); - - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; - - // Write the layer count - data_ctx.write(&n_layer, sizeof(n_layer)); - - // Write n_embd_v_gqa (reference value) - { - const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_k_s(); - data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + // validate model information + void read_model_info(const struct llama_context * ctx) { + std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + std::string arch_str; + read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise } - // Iterate the ranges and write all the pos (this is the token position in the prompt) - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - data_ctx.write(&cell.pos, sizeof(cell.pos)); + void read_rng(std::mt19937 & rng) { 
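// [editor's note] The RNG state round-trips through its standard textual
// stream form (operator<< / operator>> of std::mt19937), which is what let
// this patch drop the fixed LLAMA_MAX_RNG_STATE bound. A self-contained
// illustration, not part of the patch:
//
//     std::mt19937 a(1337), b;
//     std::ostringstream os; os << a;            // serialize (~6701 bytes)
//     std::istringstream is(os.str()); is >> b;  // restore
//     assert(a == b);                            // identical generator state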
+ std::string rng_str; + read_string(rng_str); + + std::istringstream rng_ss(rng_str); + rng_ss >> rng; + + if (rng_ss.fail()) { + throw std::runtime_error("failed to load RNG state"); } } - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - std::vector tmp_buf; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + void read_output_ids(struct llama_context * ctx) { + std::vector output_pos; - // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - data_ctx.write(&k_type_i, sizeof(k_type_i)); + uint32_t n_outputs; + read_to(&n_outputs, sizeof(n_outputs)); - // Write row size of key - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - data_ctx.write(&k_size_row, sizeof(k_size_row)); + if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); + if (n_outputs) { + output_pos.resize(n_outputs); + read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= ctx->cparams.n_batch) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + } + ctx->output_ids[id] = i; + } + + ctx->n_outputs = n_outputs; } } - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + void read_logits(struct llama_context * ctx) { + uint64_t logits_size; + read_to(&logits_size, sizeof(logits_size)); - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); + if (ctx->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } - // Write row size of value - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - data_ctx.write(&v_size_row, sizeof(v_size_row)); + if (logits_size) { + read_to(ctx->logits, logits_size * sizeof(float)); + } + } - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); + void read_embeddings(struct llama_context * ctx) { + uint64_t embeddings_size; + read_to(&embeddings_size, sizeof(embeddings_size)); + + if (ctx->embd_size < embeddings_size) { + throw std::runtime_error("embeddings buffer too small"); + } + + if (embeddings_size) { + read_to(ctx->embd, embeddings_size * sizeof(float)); + } + } + + bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { + struct llama_kv_cache & kv_self = ctx->kv_self; + + if (dest_seq_id != -1) { + // single sequence + + llama_kv_cache_seq_rm(kv_self, dest_seq_id, 
-1, -1); + + llama_batch batch = llama_batch_init(cell_count, 0, 1); + batch.n_tokens = cell_count; + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = dest_seq_id; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + llama_batch_free(batch); + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); + GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + + // Cleanup + llama_batch_free(batch); + } else { + // whole KV cache restore + + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + llama_kv_cache_clear(kv_self); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = kv_self.cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + read_to(&seq_id, sizeof(seq_id)); + + if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + return false; + } + + cell.seq_id.insert(seq_id); + } + } + + kv_self.head = 0; + kv_self.used = cell_count; + } + + return true; + } + + bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { + const struct llama_hparams & hparams = ctx->model.hparams; + struct llama_kv_cache & kv_self = ctx->kv_self; + uint32_t v_trans; + uint32_t n_layer; + read_to(&v_trans, sizeof(v_trans)); + read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + return false; + } + if (kv_self.v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = 
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); } } - } else { - // For the values, they are transposed, so we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - // Write element size - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - data_ctx.write(&v_size_el, sizeof(v_size_el)); + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - tmp_buf.resize(range_size * v_size_el); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); + // Read row size of value + uint64_t v_size_row_ref; + read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + 
read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; + ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } } } } + return true; } + void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + uint32_t cell_count; + read_to(&cell_count, sizeof(cell_count)); + + bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); + + if (!res) { + if (seq_id == -1) { + llama_kv_cache_clear(ctx); + } else { + llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } + } +}; + +struct llama_data_write_dummy : llama_data_write { + size_t size_written = 0; + + llama_data_write_dummy() {} + + // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context + + void write(const void * /* src */, size_t size) override { + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_write_buffer : llama_data_write { + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; + + llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + void write(const void * src, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_read_buffer : llama_data_read { + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; + + llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; + } + + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); + } + + size_t get_size_read() override { + return size_read; + } +}; + +struct llama_data_write_file : llama_data_write { + llama_file * file; + size_t size_written = 0; + + llama_data_write_file(llama_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_read_file : llama_data_read { + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; + + llama_data_read_file(llama_file * f) : file(f) {} + + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; + } + + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } + + size_t get_size_read() override { + return size_read; + } +}; + +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_write_file 
data_ctx(&file); + * llama_state_get_data_internal(ctx, data_ctx); + * + * buffer context: + * std::vector buf(max_size, 0); + * llama_data_write_buffer data_ctx(buf.data(), max_size); + * llama_state_get_data_internal(ctx, data_ctx); + * +*/ +static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) { + llama_synchronize(ctx); + + data_ctx.write_model_info(ctx); + + data_ctx.write_rng(ctx->sampling.rng); + + // copy outputs + data_ctx.write_output_ids(ctx); + data_ctx.write_logits(ctx); + data_ctx.write_embeddings(ctx); + + data_ctx.write_kv_cache(ctx); + return data_ctx.get_size_written(); } -size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) { - llama_data_buffer_context data_ctx(dst); +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} + +// Returns the *actual* size of the state. +// Intended to be used when saving to state to a buffer. +size_t llama_state_get_size(struct llama_context * ctx) { + llama_data_write_dummy data_ctx; + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} + +static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) { + llama_synchronize(ctx); + + data_ctx.read_model_info(ctx); + + // set rng + data_ctx.read_rng(ctx->sampling.rng); + + // set outputs + data_ctx.read_output_ids(ctx); + data_ctx.read_logits(ctx); + data_ctx.read_embeddings(ctx); + + data_ctx.read_kv_cache(ctx); + + return data_ctx.get_size_read(); +} + +// Sets the state reading from the specified source address +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_set_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } +} + +static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(path_session, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size - file.tell(); + + llama_data_read_file data_ctx(&file); + const size_t n_read = llama_state_set_data_internal(ctx, data_ctx); + + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! 
size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; + } + } + return true; +} + +bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what()); + return false; + } +} + +static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + llama_file file(path_session, "wb"); + + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_data_write_file data_ctx(&file); + llama_state_get_data_internal(ctx, data_ctx); + + return true; +} + +bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + try { + return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what()); + return false; + } +} + +static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { + llama_synchronize(ctx); + + data_ctx.write_kv_cache(ctx, seq_id); + + return data_ctx.get_size_written(); +} + +size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { + llama_data_write_dummy data_ctx; return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); } -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { +size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what()); + return 0; + } +} + +static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - auto & kv_self = ctx->kv_self; - GGML_ASSERT(!kv_self.recurrent); // not implemented + data_ctx.read_kv_cache(ctx, dest_seq_id); - // Wipe the slot - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + return data_ctx.get_size_read(); +} - const uint8_t * inp = src; - - // Read size of size_t - uint32_t size_t_size; - memcpy(&size_t_size, inp, sizeof(size_t_size)); - inp += sizeof(size_t_size); - if (size_t_size != sizeof(size_t)) { - LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__); +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what()); return 0; } - - // Read the cell count - uint32_t cell_count; - memcpy(&cell_count, inp, sizeof(cell_count)); - inp += sizeof(cell_count); - 
- // Read the layer count - uint32_t n_layer_ref; - memcpy(&n_layer_ref, inp, sizeof(n_layer_ref)); - inp += sizeof(n_layer_ref); - - // Read n_embd_v_gqa - uint32_t n_embd_v_gqa_ref; - memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref)); - inp += sizeof(n_embd_v_gqa_ref); - - // Sanity check model compatibility - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; - - if (n_layer != n_layer_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref); - return 0; - } - - if (hparams.n_embd_v_gqa() != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa(), n_embd_v_gqa_ref); - return 0; - } - - // Allocate the new cells for the slot - if (cell_count) { - llama_batch batch = llama_batch_init(cell_count, 0, 1); - batch.n_tokens = cell_count; - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - memcpy(&pos, inp, sizeof(pos)); - inp += sizeof(pos); - - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = dest_seq_id; - } - if (!llama_kv_cache_find_slot(kv_self, batch)) { - llama_batch_free(batch); - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return 0; - } - - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); - - // Cleanup - llama_batch_free(batch); - } - - const uint32_t kv_size = kv_self.size; - const uint32_t kv_head = kv_self.head; - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref)); - inp += sizeof(k_type_i_ref); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return 0; - } - - // Read row size of key - size_t k_size_row_ref; - memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref)); - inp += sizeof(k_size_row_ref); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il); - return 0; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row); - inp += cell_count * k_size_row; - } - } - - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - 
inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } - - // Read row size of value - size_t v_size_row_ref; - memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref)); - inp += sizeof(v_size_row_ref); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il); - return 0; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row); - inp += cell_count * v_size_row; - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } - - // Read element size of value - size_t v_size_el_ref; - memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref)); - inp += sizeof(v_size_el_ref); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il); - return 0; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_head + j * kv_size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el); - inp += cell_count * v_size_el; - } - } - } - } - - const size_t nread = inp - src; - - return nread; } static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { @@ -18244,11 +18167,11 @@ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, con file.write_u32(LLAMA_STATE_SEQ_VERSION); // save the prompt - file.write_u32((uint32_t)n_token_count); + file.write_u32((uint32_t) n_token_count); file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_file_context data_ctx(&file); + llama_data_write_file data_ctx(&file); llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); const size_t res = file.tell(); @@ -18286,9 +18209,8 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con // restore the context state { const size_t state_size = file.size - file.tell(); - std::vector state_data(state_size); - file.read_raw(state_data.data(), state_size); - const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id); + llama_data_read_file data_ctx(&file); + const size_t nread = 
llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -18304,7 +18226,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa
     try {
         return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
         return 0;
     }
 }
@@ -18313,7 +18235,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     try {
         return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
         return 0;
     }
 }

From 4730faca618ff9cee0780580145e3cbe86f24876 Mon Sep 17 00:00:00 2001
From: Austin <77757836+teleprint-me@users.noreply.github.com>
Date: Sun, 28 Jul 2024 03:52:42 -0400
Subject: [PATCH 036/154] chore : Fix vulkan related compiler warnings, add
 help text, improve CLI options (#8477)

* chore: Fix compiler warnings, add help text, improve CLI options

* Add prototypes for function definitions

* Invert logic of --no-clean option to be more intuitive

* Provide a new help prompt with clear instructions

* chore : Add ignore rule for vulkan shader generator

Signed-off-by: teleprint-me <77757836+teleprint-me@users.noreply.github.com>

* Update ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp

Co-authored-by: 0cc4m

* chore : Remove void and apply C++ style empty parameters

* chore : Remove void and apply C++ style empty parameters

---------

Signed-off-by: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Co-authored-by: 0cc4m
---
 .gitignore                                    |  1 +
 .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 33 +++++++++++++++++--
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7c7dee0c6..c9b4d9983 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,7 @@ build*
 !docs/build.md
 /libllama.so
 /llama-*
+/vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
 cmake-build-*
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index c5be3754b..c9dbf9dfd 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -30,6 +30,20 @@
 #define ASYNCIO_CONCURRENCY 64
 
+// define prototypes
+void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
+bool directory_exists(const std::string& path);
+bool create_directory(const std::string& path);
+std::string to_uppercase(const std::string& input);
+bool string_ends_with(const std::string& str, const std::string& suffix);
+std::string join_paths(const std::string& path1, const std::string& path2);
+std::string basename(const std::string &path);
+void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
+std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
+void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
+void process_shaders(std::vector<std::future<void>>& tasks);
+void write_output_files();
+
 std::mutex lock;
 std::vector<std::pair<std::string, std::string>> shader_fnames;
 
@@ -38,7 +52,7 @@ std::string input_dir = "vulkan-shaders";
 std::string output_dir = "/tmp";
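// [editor's note] Typical invocation after this patch, using the options
// documented in the new --help text below (paths are illustrative):
//
//     vulkan-shaders-gen --glslc /usr/bin/glslc \
//         --input-dir ggml/src/vulkan-shaders \
//         --output-dir /tmp/spv \
//         --target-hpp ggml-vulkan-shaders.hpp \
//         --target-cpp ggml-vulkan-shaders.cpp
//
// Note the inverted flag: omitting --no-clean now means clean == true, so
// intermediate SPIR-V files are removed once the outputs are written.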
std::string target_hpp = "ggml-vulkan-shaders.hpp"; std::string target_cpp = "ggml-vulkan-shaders.cpp"; -bool no_clean = false; +bool clean = true; const std::vector type_names = { "f32", @@ -464,8 +478,9 @@ void write_output_files() { } fprintf(src, "\n};\n\n"); - if (!no_clean) { + if (clean) { std::remove(path.c_str()); + // fprintf(stderr, "Removed: %s\n", path.c_str()); } } @@ -481,6 +496,18 @@ int main(int argc, char** argv) { } } + if (argc <= 1 || args.find("--help") != args.end()) { + std::cout << "Usage:\n" + "\tvulkan-shaders-gen [options]\n\n" + "Options:\n" + "\t--glslc Path to glslc executable (default: /usr/bin/glslc)\n" + "\t--input-dir Directory containing shader sources (required)\n" + "\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n" + "\t--target-hpp Path to generate a header file with shader declarations in C++ format\n" + "\t--target-cpp Path to generate a source code file implementing the declared shaders (optional)\n" + "\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n"; + return EXIT_SUCCESS; + } if (args.find("--glslc") != args.end()) { GLSLC = args["--glslc"]; // Path to glslc } @@ -497,7 +524,7 @@ int main(int argc, char** argv) { target_cpp = args["--target-cpp"]; // Path to generated cpp file } if (args.find("--no-clean") != args.end()) { - no_clean = true; // Keep temporary SPIR-V files in output-dir after build + clean = false; // Keep temporary SPIR-V files in output-dir after build } if (!directory_exists(input_dir)) { From 6eeaeba126ff701f3e8f79f246805b7023709972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 28 Jul 2024 22:32:44 +0200 Subject: [PATCH 037/154] cmake: use 1 more thread for non-ggml in CI (#8740) --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a1e183d11..b9246659a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,7 +860,8 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON - cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml + cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name id: tag From 0832de723695ab400316a6c49b9f712380e3a731 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Mon, 29 Jul 2024 10:50:27 +0800 Subject: [PATCH 038/154] [SYCL] add conv support (#8688) --- ggml/src/ggml-sycl.cpp | 12 +++++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/conv.cpp | 99 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/conv.hpp | 21 ++++++++ ggml/src/ggml-sycl/presets.hpp | 1 + 5 files changed, 134 insertions(+) create mode 100644 ggml/src/ggml-sycl/conv.cpp create mode 100644 ggml/src/ggml-sycl/conv.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 7cb07d0dc..d1dd07f64 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_func_t func; switch (tensor->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + func = ggml_sycl_op_conv_transpose_1d; + break; case GGML_OP_REPEAT: func = ggml_sycl_repeat; break; @@ -5090,6 +5093,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 067181de3..98b0ebc19 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -15,6 +15,7 @@ #include "concat.hpp" #include "common.hpp" +#include "conv.hpp" #include "convert.hpp" #include "dequantize.hpp" #include "dmmv.hpp" diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp new file mode 100644 index 000000000..bc4ab1ddb --- /dev/null +++ b/ggml/src/ggml-sycl/conv.cpp @@ -0,0 +1,99 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "conv.hpp" + +static void conv_transpose_1d_kernel( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float * src0, const float * src1, float * dst, + const sycl::nd_item<3> &item_ct1) { + int global_index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (global_index >= output_size) { + return; + } + + int out_index = global_index / dst_ne0; + + float accumulator = 0; + + for (int c = 0; c < src0_ne2; c++) { + int idx = global_index % dst_ne0; + + int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); + int input_offset = src1_ne0 * c; + + for (int i = 0; i < src1_ne0; i++) { + if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { + continue; + } + int weight_idx = idx - i*s0; + + float kernel_weight = src0[kernel_offset + weight_idx]; + float input_value = src1[input_offset+i]; + + accumulator += kernel_weight * input_value; + } + } + dst[global_index] = accumulator; +} + +static void conv_transpose_1d_f32_f32_sycl( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float *src0, const float *src1, float *dst, + const queue_ptr& stream) { + + const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel( + s0, output_size, + src0_ne0, src0_ne1, src0_ne2, + src1_ne0, dst_ne0, + src0, src1, dst, item_ct1); + }); +} + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst) { + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *)dst->op_params; + + const int s0 = opts[0]; + + const int64_t output_size = ggml_nelements(dst); + + conv_transpose_1d_f32_f32_sycl(s0, output_size, + src0->ne[0], src0->ne[1], src0->ne[2], + src1->ne[0], dst->ne[0], + src0_d, src1_d, dst_d, stream); +} + diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp new file mode 100644 index 000000000..eb20730f9 --- /dev/null +++ b/ggml/src/ggml-sycl/conv.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONV_HPP +#define GGML_SYCL_CONV_HPP + +#include "common.hpp" + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst); + +#endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 15ddcac1f..479789626 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -41,6 +41,7 @@ #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 +#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X From 439b3fc75a8deb42899ac47a8f52aae75e0339fe Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Mon, 29 Jul 2024 20:56:12 +0800 Subject: [PATCH 039/154] cuda : organize vendor-specific headers into vendors directory (#8746) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/common.cuh | 378 +----------------------------- ggml/src/ggml-cuda/vendors/cuda.h | 14 ++ ggml/src/ggml-cuda/vendors/hip.h | 177 ++++++++++++++ ggml/src/ggml-cuda/vendors/musa.h | 171 ++++++++++++++ 4 files changed, 366 insertions(+), 374 deletions(-) create mode 100644 ggml/src/ggml-cuda/vendors/cuda.h create mode 100644 ggml/src/ggml-cuda/vendors/hip.h create mode 100644 ggml/src/ggml-cuda/vendors/musa.h diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 8c3c20b90..eb39b6d23 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -27,255 +27,11 @@ #include #if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasDestroy hipblasDestroy -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming 
-#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterPortable hipHostRegisterPortable -#define cudaHostRegisterReadOnly hipHostRegisterReadOnly -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#define cudaMemcpy hipMemcpy -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyPeerAsync hipMemcpyPeerAsync -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaMemGetInfo hipMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread hipStreamPerThread -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap() do { abort(); __builtin_unreachable(); } while(0) -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#include "vendors/hip.h" #elif defined(GGML_USE_MUSA) -#include -#include -#include -#include -// XXX: Keep the following order the same as hipBLAS -// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F -// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F -#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F -#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N MUBLAS_OP_N -#define CUBLAS_OP_T MUBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -// #define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F MUSA_R_16F -#define CUDA_R_32F MUSA_R_32F -// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -// #define cublasComputeType_t mublasComputeType_t -#define cublasCreate mublasCreate -#define cublasDestroy mublasDestroy -#define cublasGemmEx mublasGemmEx -#define cublasGemmBatchedEx mublasGemmBatchedEx -#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx -#define 
cublasHandle_t mublasHandle_t -// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetMathMode mublasSetMathMode -#define cublasSetStream mublasSetStream -#define cublasSgemm mublasSgemm -#define cublasStatus_t mublasStatus_t -#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess -#define cudaDeviceProp musaDeviceProp -#define cudaDeviceSynchronize musaDeviceSynchronize -#define cudaError_t musaError_t -#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags musaEventCreateWithFlags -#define cudaEventDisableTiming musaEventDisableTiming -#define cudaEventRecord musaEventRecord -#define cudaEventSynchronize musaEventSynchronize -#define cudaEvent_t musaEvent_t -#define cudaEventDestroy musaEventDestroy -#define cudaFree musaFree -#define cudaFreeHost musaFreeHost -#define cudaGetDevice musaGetDevice -#define cudaGetDeviceCount musaGetDeviceCount -#define cudaGetDeviceProperties musaGetDeviceProperties -#define cudaGetErrorString musaGetErrorString -#define cudaGetLastError musaGetLastError -#define cudaHostRegister musaHostRegister -#define cudaHostRegisterPortable musaHostRegisterPortable -#define cudaHostRegisterReadOnly musaHostRegisterReadOnly -#define cudaHostUnregister musaHostUnregister -#define cudaLaunchHostFunc musaLaunchHostFunc -#define cudaMalloc musaMalloc -#define cudaMallocHost musaMallocHost -#define cudaMemcpy musaMemcpy -#define cudaMemcpyAsync musaMemcpyAsync -#define cudaMemcpyPeerAsync musaMemcpyPeerAsync -#define cudaMemcpy2DAsync musaMemcpy2DAsync -#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost -#define cudaMemcpyHostToDevice musaMemcpyHostToDevice -#define cudaMemcpyKind musaMemcpyKind -#define cudaMemset musaMemset -#define cudaMemsetAsync musaMemsetAsync -#define cudaMemGetInfo musaMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize -#define cudaSetDevice musaSetDevice -#define cudaStreamCreateWithFlags musaStreamCreateWithFlags -#define cudaStreamDestroy musaStreamDestroy -#define cudaStreamFireAndForget musaStreamFireAndForget -#define cudaStreamNonBlocking musaStreamNonBlocking -#define cudaStreamPerThread musaStreamPerThread -#define cudaStreamSynchronize musaStreamSynchronize -#define cudaStreamWaitEvent musaStreamWaitEvent -#define cudaStream_t musaStream_t -#define cudaSuccess musaSuccess - -// XXX: Other CUDA => MUSA mapping -#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE -#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED -#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED -#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE -#define CUdevice MUdevice -#define CUdeviceptr MUdeviceptr -#define CUmemAccessDesc MUmemAccessDesc -#define CUmemAllocationProp MUmemAllocationProp -#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle -#define cuDeviceGet muDeviceGet -#define cuDeviceGetAttribute muDeviceGetAttribute -#define cuMemAddressFree muMemAddressFree -#define cuMemAddressReserve muMemAddressReserve -#define cuMemCreate muMemCreate -#define cuMemGetAllocationGranularity 
muMemGetAllocationGranularity -#define cuMemMap muMemMap -#define cuMemRelease muMemRelease -#define cuMemSetAccess muMemSetAccess -#define cuMemUnmap muMemUnmap -#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize -#define cudaFuncSetAttribute musaFuncSetAttribute -#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms -#define make_cudaExtent make_musaExtent -#define make_cudaPitchedPtr make_musaPitchedPtr - -// XXX: USE_CUDA_GRAPH -#define CUDA_SUCCESS MUSA_SUCCESS -#define CUresult MUresult -#define cuGetErrorString muGetErrorString -#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure -#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction -#define cudaGraphDestroy musaGraphDestroy -#define cudaGraphExecDestroy musaGraphExecDestroy -#define cudaGraphExec_t musaGraphExec_t -#define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult -#define cudaGraphGetNodes musaGraphGetNodes -#define cudaGraphInstantiate musaGraphInstantiate -#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams -#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams -#define cudaGraphLaunch musaGraphLaunch -#define cudaGraphNodeGetType musaGraphNodeGetType -#define cudaGraphNode_t musaGraphNode_t -#define cudaGraphNodeType musaGraphNodeType -#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel -#define cudaGraph_t musaGraph_t -#define cudaKernelNodeParams musaKernelNodeParams -#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed -#define cudaStreamEndCapture musaStreamEndCapture - -// XXX: cuBLAS => muBLAS mapping -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa +#include "vendors/musa.h" #else -#include -#include -#include -#include - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - +#include "vendors/cuda.h" #endif // defined(GGML_USE_HIPBLAS) #define STRINGIZE_IMPL(...) 
#__VA_ARGS__ @@ -318,11 +74,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { -#ifndef GGML_USE_MUSA return cublasGetStatusString(err); -#else - return mublasStatus_to_string(err); -#endif // GGML_USE_MUSA } #else static const char * cublas_get_error_str(const cublasStatus_t err) { @@ -364,129 +116,7 @@ typedef half2 dfloat2; #else typedef float dfloat; // dequantize float typedef float2 dfloat2; -#endif //GGML_CUDA_F16 - -#if defined(GGML_USE_MUSA) -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0x00 : 0xff; - } - return c; -} -#endif // defined(GGML_USE_MUSA) - -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#if defined(__gfx1010__) || defined(__gfx1012__) -#define RDNA1 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __vsub4(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
-    typedef union half2_b32 {
-        half2 val;
-        int b32;
-    } half2_b32_t;
-    half2_b32_t tmp;
-    tmp.val = var;
-    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
-    return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_CUDA_F16

 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
new file mode 100644
index 000000000..db9f6a165
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
new file mode 100644
index 000000000..d0c377255
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled 
hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap() do { abort(); __builtin_unreachable(); } while(0) +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED + +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = 
reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int &>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
+    return c;
+}
+
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+// __shfl_xor() for half2 was added in ROCm 5.6
+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
+    typedef union half2_b32 {
+        half2 val;
+        int b32;
+    } half2_b32_t;
+    half2_b32_t tmp;
+    tmp.val = var;
+    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
+    return tmp.val;
+}
+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
new file mode 100644
index 000000000..e50a103ac
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include <musa_runtime.h>
+#include <musa.h>
+#include <mublas.h>
+#include <musa_fp16.h>
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N MUBLAS_OP_N
+#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUDA_R_16F MUSA_R_16F
+#define CUDA_R_32F MUSA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#define cublasCreate mublasCreate
+#define cublasDestroy mublasDestroy
+#define cublasGemmEx mublasGemmEx
+#define cublasGemmBatchedEx mublasGemmBatchedEx
+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
+#define cublasHandle_t mublasHandle_t
+#define cublasSetMathMode mublasSetMathMode
+#define cublasSetStream mublasSetStream
+#define cublasSgemm mublasSgemm
+#define cublasStatus_t mublasStatus_t
+#define cublasGetStatusString mublasStatus_to_string
+#define cudaDataType_t musaDataType_t
+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
+#define cudaDeviceProp musaDeviceProp
+#define cudaDeviceSynchronize musaDeviceSynchronize
+#define cudaError_t musaError_t
+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled 
musaErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags musaEventCreateWithFlags +#define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventRecord musaEventRecord +#define cudaEventSynchronize musaEventSynchronize +#define cudaEvent_t musaEvent_t +#define cudaEventDestroy musaEventDestroy +#define cudaFree musaFree +#define cudaFreeHost musaFreeHost +#define cudaGetDevice musaGetDevice +#define cudaGetDeviceCount musaGetDeviceCount +#define cudaGetDeviceProperties musaGetDeviceProperties +#define cudaGetErrorString musaGetErrorString +#define cudaGetLastError musaGetLastError +#define cudaHostRegister musaHostRegister +#define cudaHostRegisterPortable musaHostRegisterPortable +#define cudaHostRegisterReadOnly musaHostRegisterReadOnly +#define cudaHostUnregister musaHostUnregister +#define cudaLaunchHostFunc musaLaunchHostFunc +#define cudaMalloc musaMalloc +#define cudaMallocHost musaMallocHost +#define cudaMemcpy musaMemcpy +#define cudaMemcpyAsync musaMemcpyAsync +#define cudaMemcpyPeerAsync musaMemcpyPeerAsync +#define cudaMemcpy2DAsync musaMemcpy2DAsync +#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost +#define cudaMemcpyHostToDevice musaMemcpyHostToDevice +#define cudaMemcpyKind musaMemcpyKind +#define cudaMemset musaMemset +#define cudaMemsetAsync musaMemsetAsync +#define cudaMemGetInfo musaMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize +#define cudaSetDevice musaSetDevice +#define cudaStreamCreateWithFlags musaStreamCreateWithFlags +#define cudaStreamDestroy musaStreamDestroy +#define cudaStreamFireAndForget musaStreamFireAndForget +#define cudaStreamNonBlocking musaStreamNonBlocking +#define cudaStreamPerThread musaStreamPerThread +#define cudaStreamSynchronize musaStreamSynchronize +#define cudaStreamWaitEvent musaStreamWaitEvent +#define cudaStream_t musaStream_t +#define cudaSuccess musaSuccess + +// Additional mappings for MUSA virtual memory pool +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED +#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED +#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE +#define CUdevice MUdevice +#define CUdeviceptr MUdeviceptr +#define CUmemAccessDesc MUmemAccessDesc +#define CUmemAllocationProp MUmemAllocationProp +#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle +#define cuDeviceGet muDeviceGet +#define cuDeviceGetAttribute muDeviceGetAttribute +#define cuMemAddressFree muMemAddressFree +#define cuMemAddressReserve muMemAddressReserve +#define cuMemCreate muMemCreate +#define cuMemGetAllocationGranularity muMemGetAllocationGranularity +#define cuMemMap muMemMap +#define cuMemRelease muMemRelease +#define cuMemSetAccess muMemSetAccess +#define cuMemUnmap muMemUnmap +#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize +#define cudaFuncSetAttribute musaFuncSetAttribute +#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms +#define make_cudaExtent make_musaExtent +#define make_cudaPitchedPtr make_musaPitchedPtr + +// Additional mappings for MUSA graphs +#define CUDA_SUCCESS MUSA_SUCCESS +#define CUresult MUresult +#define cuGetErrorString muGetErrorString +#define cudaErrorGraphExecUpdateFailure 
musaErrorGraphExecUpdateFailure
+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
+#define cudaGraphDestroy musaGraphDestroy
+#define cudaGraphExecDestroy musaGraphExecDestroy
+#define cudaGraphExec_t musaGraphExec_t
+#define cudaGraphExecUpdate musaGraphExecUpdate
+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
+#define cudaGraphGetNodes musaGraphGetNodes
+#define cudaGraphInstantiate musaGraphInstantiate
+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
+#define cudaGraphLaunch musaGraphLaunch
+#define cudaGraphNodeGetType musaGraphNodeGetType
+#define cudaGraphNode_t musaGraphNode_t
+#define cudaGraphNodeType musaGraphNodeType
+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
+#define cudaGraph_t musaGraph_t
+#define cudaKernelNodeParams musaKernelNodeParams
+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
+#define cudaStreamEndCapture musaStreamEndCapture
+
+// XXX: Clang builtins mapping
+#define __vsub4 __vsub4_musa
+#define __vcmpeq4 __vcmpeq4_musa
+#define __vcmpne4 __vcmpne4_musa
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+
+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
+    return c;
+}

From 75af08c475e285888f66556d0f459c533b7deb95 Mon Sep 17 00:00:00 2001
From: CarterLi999 <664681047@qq.com>
Date: Tue, 30 Jul 2024 00:38:34 +0800
Subject: [PATCH 040/154] ggml: bugfix: fix agnostic handling of inactive
 elements for risc-v vector (#8748)

In this code we want the inactive elements to retain the value they
previously held when mask[i] is false, so the masked intrinsics must use
the undisturbed policy. Under the default agnostic policy of the RVV
intrinsics, these values may either be kept or be overwritten with all 1s.
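A scalar model makes the policy difference concrete (an illustrative
sketch in plain C++, not the actual RVV intrinsics; the names and the
int8 element type are simplified stand-ins):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of a masked vector subtract vd[i] = vs[i] - x.
    // Undisturbed (the _mu intrinsics): inactive lanes keep the value of the
    // pass-through operand, which is what the q3_K/q5_K kernels rely on.
    void vsub_vx_undisturbed(const bool *mask, const int8_t *passthru,
                             const int8_t *vs, int8_t x, int8_t *vd, int vl) {
        for (int i = 0; i < vl; ++i) {
            vd[i] = mask[i] ? (int8_t)(vs[i] - x) : passthru[i];
        }
    }

    // Mask agnostic (the _m intrinsics with the default policy): inactive
    // lanes are unspecified; writing all 1s is one legal outcome, so reading
    // them back afterwards is a bug.
    void vsub_vx_agnostic(const bool *mask, const int8_t *vs,
                          int8_t x, int8_t *vd, int vl) {
        for (int i = 0; i < vl; ++i) {
            vd[i] = mask[i] ? (int8_t)(vs[i] - x) : (int8_t)0xFF;
        }
    }

    int main() {
        bool   mask[4] = {true, false, true, false};
        int8_t q[4]    = {8, 9, 10, 11};
        int8_t out[4];
        vsub_vx_undisturbed(mask, /*passthru=*/q, q, 0x4, out, 4);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 4 9 6 11
        return 0;
    }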
Co-authored-by: carter.li --- ggml/src/ggml-quants.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9016314f5..16aaf523f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r // compute mask for subtraction vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); m <<= 1; vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); m <<= 1; vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); m <<= 1; vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); m <<= 1; // load Q8 and take product with Q3 @@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl); m <<= 1; vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl); m <<= 1; vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); From c887d8b01726b11ea03dbcaa9d44fa74422d0076 Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Tue, 30 Jul 2024 14:56:51 +0800 Subject: [PATCH 041/154] [SYCL] Add `TIMESTEP_EMBEDDING` OP (#8707) Signed-off-by: zhentaoyu --- ggml/src/ggml-sycl.cpp | 4 ++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/presets.hpp | 1 + ggml/src/ggml-sycl/tsembd.cpp | 71 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/tsembd.hpp | 21 ++++++++++ 5 files changed, 98 insertions(+) create mode 100644 ggml/src/ggml-sycl/tsembd.cpp create mode 100644 ggml/src/ggml-sycl/tsembd.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index d1dd07f64..d8eb86c2c 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -4108,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_ARGSORT: func = ggml_sycl_argsort; break; + case GGML_OP_TIMESTEP_EMBEDDING: + func = ggml_sycl_op_timestep_embedding; + break; default: return false; } @@ -5225,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: + case 
GGML_OP_TIMESTEP_EMBEDDING: return true; default: return false; diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 98b0ebc19..58dd9c9a6 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -24,5 +24,6 @@ #include "rope.hpp" #include "norm.hpp" #include "softmax.hpp" +#include "tsembd.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 479789626..340ab8e93 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -42,6 +42,7 @@ #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 +#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp new file mode 100644 index 000000000..d5c227cd1 --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -0,0 +1,71 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "tsembd.hpp" + +static void timestep_embedding_f32( + const float * timesteps, float * dst, const int nb1, + const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) { + // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0] + // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = item_ct1.get_group(1); + int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half); + float arg = timestep * freq; + embed_data[j] = sycl::cos(arg); + embed_data[j + half] = sycl::sin(arg); +} + +static void timestep_embedding_f32_sycl( + const float * x, float * dst, const int ne00, const int nb1, + const int dim, const int max_period, const queue_ptr& stream) { + // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad + int half_ceil = dim / 2; + int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE; + sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE); + sycl::range<3> gridDim(1, ne00, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + gridDim * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + timestep_embedding_f32( + x, dst, nb1, dim, max_period, item_ct1 + ); + }); +} + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst) { + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); +} diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp new file mode 
100644 index 000000000..ff854c337 --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_TSEMBD_HPP +#define GGML_SYCL_TSEMBD_HPP + +#include "common.hpp" + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst); + +#endif // GGML_SYCL_TSEMBD_HPP From 6e2b6000e5fe808954a7dcef8225b5b7f2c1b9e9 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Tue, 30 Jul 2024 18:37:35 +0800 Subject: [PATCH 042/154] cann: update cmake (#8765) --- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index a5c2e96a8..7fe1661bb 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -207,6 +207,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-alloc.h include/ggml-backend.h include/ggml-blas.h + include/ggml-cann.h include/ggml-cuda.h include/ggml.h include/ggml-kompute.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 836496fb9..425a25895 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -849,11 +849,6 @@ if (GGML_CANN) ${CANN_INSTALL_DIR}/acllib/include ) - # TODO: find libs - link_directories( - ${CANN_INSTALL_DIR}/lib64 - ) - add_subdirectory(ggml-cann/kernels) list(APPEND CANN_LIBRARIES ascendcl @@ -872,6 +867,7 @@ if (GGML_CANN) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() From 140074bb8647df41840d6f32f4409fa8959bcf9f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 30 Jul 2024 15:58:57 +0300 Subject: [PATCH 043/154] flake.lock: Update (#8729) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 940cda6a4..3dc68abb6 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1721379653, - "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=", + "lastModified": 1722062969, + "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374", + "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3", "type": "github" }, "original": { From 7c27a19b2eb91bb0f43c7f7aec0386cec2dddc33 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Tue, 30 Jul 2024 23:40:18 +0900 Subject: [PATCH 044/154] added android implementation of ggml_print_backtrace_symbols (#8751) * added android implementation of ggml_print_backtrace_symbols * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren * Update ggml/src/ggml.c Co-authored-by: slaren --------- Co-authored-by: slaren --- ggml/src/ggml.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c76d00a39..4d5667884 100644 --- a/ggml/src/ggml.c 
+++ b/ggml/src/ggml.c @@ -141,7 +141,51 @@ typedef pthread_t ggml_thread_t; #include -#if defined(__linux__) +#if defined(__ANDROID__) +#include +#include +#include + +struct backtrace_state { + void ** current; + void ** end; +}; + +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { + struct backtrace_state * state = (struct backtrace_state *)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = (void*)pc; + } + } + return _URC_NO_REASON; +} + +static void ggml_print_backtrace_symbols(void) { + const int max = 100; + void* buffer[max]; + + struct backtrace_state state = {buffer, buffer + max}; + _Unwind_Backtrace(unwind_callback, &state); + + int count = state.current - buffer; + + for (int idx = 0; idx < count; ++idx) { + const void * addr = buffer[idx]; + const char * symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); + } +} +#elif defined(__linux__) #include static void ggml_print_backtrace_symbols(void) { void * trace[100]; From 7e72aa74fd676a093eb9970e761085ec22734c71 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 31 Jul 2024 00:57:03 +1000 Subject: [PATCH 045/154] py: add_array() will not add to kv store if value is an empty array (#8774) * gguf_writer.py: add_array() should not add to kv store if empty * Apply suggestions from code review I was wondering if there was a specific reason for `if val` but good to hear we can safely use `len(val == 0` Co-authored-by: compilade --------- Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ba6f53cda..2e0b335ee 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -312,6 +312,8 @@ class GGUFWriter: self.add_key_value(key, val, GGUFValueType.STRING) def add_array(self, key: str, val: Sequence[Any]) -> None: + if len(val) == 0: + return self.add_key_value(key, val, GGUFValueType.ARRAY) @staticmethod @@ -845,7 +847,14 @@ class GGUFWriter: encoded_val = val.encode("utf-8") if isinstance(val, str) else val kv_data += self._pack("Q", len(encoded_val)) kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: + elif vtype == GGUFValueType.ARRAY: + + if not isinstance(val, Sequence): + raise ValueError("Invalid GGUF metadata array, expecting sequence") + + if len(val) == 0: + raise ValueError("Invalid GGUF metadata array. Empty array") + if isinstance(val, bytes): ltype = GGUFValueType.UINT8 else: From 268c5660062270a2c19a36fc655168aa287aaec2 Mon Sep 17 00:00:00 2001 From: Someone Date: Tue, 30 Jul 2024 23:35:30 +0300 Subject: [PATCH 046/154] nix: cuda: rely on propagatedBuildInputs (#8772) Listing individual outputs no longer necessary to reduce the runtime closure size after https://github.com/NixOS/nixpkgs/pull/323056. 
--- .devops/nix/package.nix | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 911c42ecb..a87423c71 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -126,16 +126,9 @@ let ++ optionals useMetalKit [ MetalKit ]; cudaBuildInputs = with cudaPackages; [ - cuda_cccl.dev # - - # A temporary hack for reducing the closure size, remove once cudaPackages - # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 - cuda_cudart.dev - cuda_cudart.lib - cuda_cudart.static - libcublas.dev - libcublas.lib - libcublas.static + cuda_cudart + cuda_cccl # + libcublas ]; rocmBuildInputs = with rocmPackages; [ From 44d28ddd5caaa5e9de573bdaaa5b5b2448a29ace Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 31 Jul 2024 16:40:08 +0300 Subject: [PATCH 047/154] cmake : fix use of external ggml (#8787) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 793709122..a31320635 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # -get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS) +get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) +get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) From 398ede5efeb07b9adf9fbda7ea63f630d476a792 Mon Sep 17 00:00:00 2001 From: pculliton Date: Wed, 31 Jul 2024 11:12:10 -0400 Subject: [PATCH 048/154] Adding Gemma 2 2B configs (#8784) * Adding Gemma 2 2B configs Updates to Q scaling and Gemma 2 model sizes to match v2 2B model. * Update src/llama.cpp Co-authored-by: slaren --------- Co-authored-by: slaren --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index a207451f5..e6f303d31 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4969,6 +4969,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -11736,6 +11737,7 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; default: GGML_ABORT("fatal error"); From ed9d2854c9de4ae1f448334294e61167b04bec2a Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Wed, 31 Jul 2024 15:51:06 -0400 Subject: [PATCH 049/154] Build: Fix potential race condition (#8781) * Fix potential race condition as pointed out by @fairydreaming in #8776 * Reference the .o rather than rebuilding every time. * Adding in CXXFLAGS and LDFLAGS * Removing unnecessary linker flags. 
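The race the first bullet fixes: before this change, several targets each
ran their own `$(CXX) -c` on deprecation-warning.cpp with the same derived
object path, so under `make -jN` two compiler processes could write that
file at once. A small C++ sketch of why concurrent writers corrupt a
shared output path (illustrative only, not the build system itself):

    #include <fstream>
    #include <thread>

    // Two "compilers" writing the same output path concurrently: the result
    // can hold an unpredictable mix of either payload. A single rule that
    // produces the .o exactly once removes the race.
    static void fake_compiler(char tag, int n) {
        std::ofstream out("shared.o", std::ios::trunc); // each opener truncates
        for (int i = 0; i < n; ++i) out.put(tag);
    }

    int main() {
        std::thread a(fake_compiler, 'A', 1 << 20);
        std::thread b(fake_compiler, 'B', 1 << 20);
        a.join(); b.join(); // shared.o now holds an arbitrary mix of A and B
        return 0;
    }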
--- Makefile | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index c82f4268a..f4ce4f1fb 100644 --- a/Makefile +++ b/Makefile @@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Mark legacy binary targets as .PHONY so that they are always checked. .PHONY: main quantize perplexity embedding server +# Define the object file target +examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +main: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +server: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.cpp +quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif -perplexity: examples/deprecation-warning/deprecation-warning.cpp +perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif -embedding: examples/deprecation-warning/deprecation-warning.cpp +embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." 
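The examples/deprecation-warning/deprecation-warning.cpp translation unit
referenced by these rules is not shown in this patch; a minimal sketch of
what such a shim plausibly contains (hypothetical, for illustration; the
real file may differ):

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Built once as deprecation-warning.o and linked under each legacy
    // binary name; all it does is point the user at the renamed llama-* tool.
    int main(int argc, char ** argv) {
        std::string name = (argc > 0 && argv[0]) ? argv[0] : "main";
        // strip any leading path so ./build/bin/main and main behave the same
        const size_t slash = name.find_last_of("/\\");
        if (slash != std::string::npos) {
            name = name.substr(slash + 1);
        }
        // 'main' was renamed to 'llama-cli'; the other tools gained a prefix
        const std::string repl = (name == "main") ? "llama-cli" : "llama-" + name;
        fprintf(stderr, "WARNING: '%s' is deprecated, use '%s' instead.\n",
                name.c_str(), repl.c_str());
        return EXIT_FAILURE; // fail loudly rather than silently doing nothing
    }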
From afbbcf3c04e3c6420cad3d72571478cd62ac176c Mon Sep 17 00:00:00 2001
From: Igor Okulist
Date: Wed, 31 Jul 2024 18:59:09 -0500
Subject: [PATCH 050/154] server : update llama-server embedding flag
 documentation (#8779)

Fixes #8763
---
 common/common.cpp         | 2 +-
 examples/server/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 60c7eac75..521f849e2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });

diff --git a/examples/server/README.md b/examples/server/README.md
index 33a2b95cc..de83ee7d0 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -247,7 +247,7 @@ server:
   --host HOST           ip address to listen (default: 127.0.0.1)
   --port PORT           port to listen (default: 8080)
   --path PATH           path to serve static files from (default: )
-  --embedding(s)        enable embedding endpoint (default: disabled)
+  --embedding(s)        restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
   --api-key KEY         API key to use for authentication (default: none)
   --api-key-file FNAME  path to file containing API keys (default: none)
   --ssl-key-file FNAME  path to file a PEM-encoded SSL private key

From c8a0090922bad576623de4aae227717085249262 Mon Sep 17 00:00:00 2001
From: wangshuai09 <391746016@qq.com>
Date: Thu, 1 Aug 2024 10:39:05 +0800
Subject: [PATCH 051/154] cann: support q8_0 for Ascend backend (#8805)

---
 ggml/src/ggml-cann/aclnn_ops.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index f27666970..90ccf3e18 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2381,10 +2381,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
     size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
     size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];

+    ggml_cann_pool_alloc input_alloctor(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        ggml_cann_pool_alloc input_alloctor(
-            ctx.pool(), ggml_nelements(src1) * input_elem_size);
+        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
         input_buffer = input_alloctor.get();

         int64_t* input_cast_ne = src1->ne;

From 7a11eb3a260915aee16101808f291a244e2facc7 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 1 Aug 2024 15:26:22 +0200
Subject: [PATCH 052/154] cuda : fix dmmv cols requirement to
 2*GGML_CUDA_DMMV_X (#8800)

* cuda : fix dmmv cols requirement to 2*GGML_CUDA_DMMV_X

* update asserts

* only use dmmv for supported types

* add test
---
 ggml/src/ggml-cuda.cu       |  5 ++---
 ggml/src/ggml-cuda/dmmv.cu  | 21 +++++++++++++++------
 ggml/src/ggml-cuda/dmmv.cuh |  2 ++
 tests/test-backend-ops.cpp  |  5 +++--
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index c73ae40d4..b510777fb 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1885,10 +1885,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
-        && src1->ne[1] == 1;
+        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
index d7a2a2513..96a5adef5 100644
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const dim3 block_nums(block_num_y, 1, 1);
@@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
 }
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
+    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
+        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
+        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
+        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
+        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
+        src0_type == GGML_TYPE_F16;
+}

diff --git a/ggml/src/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh
index 4c5ebd475..e727eb97f 100644
--- a/ggml/src/ggml-cuda/dmmv.cuh
+++ b/ggml/src/ggml-cuda/dmmv.cuh
@@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2fa59fd0a..5de70d554 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -804,8 +804,7 @@ struct test_cpy : public test_case {
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0},
-            bool _dst_use_permute = false)
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
         : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
           _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -2269,6 +2268,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), { 1,  1}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
         }
     }

From b7a08fd5e0e7c898c68d1743066ea495202d9608 Mon Sep 17 00:00:00 2001
From: Alex O'Connell <35843486+acon96@users.noreply.github.com>
Date: Thu, 1 Aug 2024 12:53:46 -0400
Subject: [PATCH 053/154] Build: Only include execinfo.h on linux systems that
 support it (#8783)

* Only enable backtrace on GLIBC linux systems

* fix missing file from copy

* use glibc macro instead of defining a custom one
---
 ggml/src/ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4d5667884..a4e89cf32 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -185,7 +185,7 @@ static void ggml_print_backtrace_symbols(void) {
         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
     }
 }
-#elif defined(__linux__)
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void ggml_print_backtrace_symbols(void) {
     void * trace[100];

From afbbcf3c04e3c6420cad3d72571478cd62ac176c Mon Sep 17 00:00:00 2001
From: matteo
Date: Thu, 1 Aug 2024 23:28:28 +0200
Subject: [PATCH 054/154] ggml-cuda: Adding support for unified memory (#8035)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Adding support for unified memory

* adding again the documentation about unified memory

* refactoring: Moved the unified memory code to the correct location.

* Fixed compilation error when using hipblas

* cleaning up the documentation

* Updating the documentation

Co-authored-by: Johannes Gäßler

* adding one more case where the PR should not be enabled

---------

Co-authored-by: matteo serva
Co-authored-by: Johannes Gäßler
---
 docs/build.md         |  6 +++++-
 ggml/src/ggml-cuda.cu | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/docs/build.md b/docs/build.md
index cfe42ebbf..8b16d1a35 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Official Support](ht
 cmake --build build --config Release
 ```

-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
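As a quick sanity check of the mechanism described above (an illustrative sketch, not part of this patch), an application can query whether the active device supports managed memory at all, and whether the opt-in variable is set, before relying on the fallback; `cudaDevAttrManagedMemory` is a standard CUDA runtime attribute, and device 0 here is an assumption:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    int managed = 0;
    // non-zero when device 0 can allocate managed memory via cudaMallocManaged
    cudaDeviceGetAttribute(&managed, cudaDevAttrManagedMemory, /*device=*/0);
    // the fallback in this patch is opt-in: ggml calls cudaMallocManaged only
    // when this environment variable is set
    const bool requested = std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
    std::printf("managed memory: supported=%d requested=%d\n", managed, requested);
    return 0;
}
```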
+
+The following compilation options are also available to tweak performance:

 | Option                        | Legal values           | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index b510777fb..68605fff6 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     }
     return res;
 #else
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    cudaError_t err;
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+    {
+        err = cudaMallocManaged(ptr, size);
+    }
+    else
+    {
+        err = cudaMalloc(ptr, size);
+    }
+    return err;
+#else
     return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
 #endif
 }

From 0fbbd884589d585c3b43cae8c16938ffffb863b9 Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI
Date: Fri, 2 Aug 2024 01:55:17 +0100
Subject: [PATCH 055/154] [SYCL] Fixing wrong VDR iq4nl value (#8812)

---
 ggml/src/ggml-sycl/mmvq.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 23232357e..1b96925e1 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1)
                 [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                    mul_mat_vec_q_iq4_nl_q8_1(
+                    mul_mat_vec_q_iq4_nl_q8_1(
                         vx, vy, dst, ncols, nrows, item_ct1);
                 });
     });

From e09a800f9a9b19c73aa78e03b4c4be8ed988f3e6 Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Fri, 2 Aug 2024 16:50:53 +0800
Subject: [PATCH 056/154] cann: Fix ggml_cann_im2col for 1D im2col (#8819)

* fix ggml_cann_im2col for 1D im2col

* fix build warning
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 167 +++++++++++++++++++++++++------
 tests/test-backend-ops.cpp      |   3 +
 2 files changed, 142 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 90ccf3e18..556284888 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1312,6 +1312,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
 #ifdef __cplusplus
 }
 #endif
+
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+                                             ggml_tensor* dst,
+                                             ggml_tensor* src1,
+                                             aclTensor* tmp_cast_tensor,
+                                             aclTensor* tmp_im2col_tensor) {
+    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+    ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+    aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+    const std::vector<int64_t>& im2col_op_params) {
+    // get params
+    const int64_t KH = im2col_op_params[0];
+    const int64_t KW = im2col_op_params[1];
+    const int64_t IW = im2col_op_params[2];
+    const int64_t IC = im2col_op_params[3];
+    const int64_t N = im2col_op_params[4];
+    const int64_t OH = im2col_op_params[5];
+    const int64_t OW = im2col_op_params[6];
+    const int64_t s0 = im2col_op_params[7];
+    const int64_t p0 = im2col_op_params[8];
+    const int64_t d0 = im2col_op_params[9];
+    const int64_t n_bytes_factor = im2col_op_params[10];
+
+    // Permute: [N, IC * KH * KW, OW * OH] ->
+    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+    aclTensor* tmp_permute_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+    void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+    int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+    size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+    tmp_permute_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    tmp_permute_tensor = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+                      3);
+    }
+
+    // number of times the kernel moves in W dimension
+    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+    size_t offset;
+    void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offset to restore 1D im2col from 2d
+    if (IC > 1) {
+        offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+        size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+        for (int c = 0; c < IC; c++) {
+            cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+                                 KH * KW * c * ggml_type_size(dst->type);
+            cur_dst_buffer = (char*)dst->data +
+                             c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+            for (int i = 0; i < n_step_w; i++) {
+                ACL_CHECK(aclrtMemcpyAsync(
+                    cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                cur_dst_buffer =
+                    (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+                cur_permute_buffer = (char*)cur_permute_buffer +
+                                     KH * KW * IC * ggml_type_size(dst->type);
+            }
+        }
+    } else {
+        offset = KH * KW * n_step_w *
+                 ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+                                   (char*)tmp_permute_buffer + offset, offset,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
 void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
@@ -1320,21 +1425,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
     GGML_TENSOR_BINARY_OP_LOCALS;

-    const int64_t N = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
+    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
+    // im2col and do post-processing to restore it to 1D.
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;

-    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t N = ne13;
+    const int64_t IC = ne12;
+    const int64_t KH = ne01;
     const int64_t KW = ne00;
+    const int64_t IW = ne10;

     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
@@ -1342,9 +1449,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));

-    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+    // memory allocated increased to 3x when is_2D == false
+    const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
-    int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+    int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
     size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];

     tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1466,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
     // dst.elemcount.
     ggml_cann_pool_alloc im2col_allocator(
-        ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
     void* tmp_im2col_buffer = im2col_allocator.get();
+
     aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
         tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
         ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1492,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         paddings, strides, tmp_im2col_tensor, &workspaceSize, &executor));

+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
     if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspace_allocator.alloc(workspaceSize);
         workspaceAddr = workspace_allocator.get();
     }

@@ -1391,9 +1504,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // Cast if dst is f16.
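    // (illustrative note) aclnnIm2col fills the intermediate buffer in src1's
    // type (f32, per the assert above); when dst is f16, the data is cast to
    // dst->type here before the permute / 1D post-processing steps.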
     aclTensor* tmp_cast_tensor = nullptr;
     ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
     if (src1->type != dst->type) {
-        tmp_cast_allocator.alloc(ggml_nbytes(dst));
-        void* tmp_cast_buffer = tmp_cast_allocator.get();
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
         size_t temp_cast_nb[GGML_MAX_DIMS - 1];
         temp_cast_nb[0] = ggml_type_size(dst->type);
         for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1522,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_type_mapping(dst->type));
     }

-    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
-    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
-    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
-    aclTensor* acl_dst =
-        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
-
-    int64_t permute_dim[] = {0, 2, 1};
-    if (src1->type != dst->type) {
-        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
     } else {
-        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+        std::vector<int64_t> im2col_op_params = {
+            KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor, im2col_op_params);
     }

     // release
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
     ACL_CHECK(aclDestroyIntArray(kernel_size));
     ACL_CHECK(aclDestroyIntArray(dilations));
     ACL_CHECK(aclDestroyIntArray(paddings));

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5de70d554..f5065f145 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2139,6 +2139,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
+    // test cases for 1D im2col
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));

     test_cases.emplace_back(new test_conv_transpose_1d());
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));

From b72c20b85c1029d135022d39e9a20d4807c11893 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 2 Aug 2024 21:11:39 +0200
Subject: [PATCH 057/154] Fix conversion of unnormalized BF16->BF16 weights
 (#7843)

* add truncate_bf16

* truncate intermediate fp32 if converting bf16 to bf16

* fix masking in __compute_fp32_to_bf16

* np.int16 no longer used

* missing cast and additional numpy 2.x fix

* ggml-impl : do not flush bf16 subnormals to zero

* ggml : add reference fp32 to bf16 conversion

The fast version is no longer equivalent for all platforms because of
the handling of subnormal values.

* gguf-py : remove flush to zero for bf16 subnormals

* gguf-py : remove float32 truncation to bf16

Rounding achieves the same thing in the cases where this was used.
* missed prototype update in merge

* merge cleanup

---------

Co-authored-by: Francis Couture-Harpin
---
 convert_hf_to_gguf.py  |  2 +-
 ggml/include/ggml.h    |  1 +
 ggml/src/ggml-impl.h   |  9 +++------
 ggml/src/ggml.c        | 11 +++++++++--
 gguf-py/gguf/quants.py | 14 ++++++--------
 5 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8ba3c5844..8b33c30d9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -316,7 +316,7 @@ class Model:
             if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                 if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                     data = gguf.quantize_bf16(data)
-                    assert data.dtype == np.int16
+                    assert data.dtype == np.uint16
                     data_qtype = gguf.GGMLQuantizationType.BF16

                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 464d765da..d8d3dceef 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -349,6 +349,7 @@ extern "C" {
     GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
     GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
     GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
     GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

     struct ggml_object;

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 7f7afdbfc..3daee4926 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
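 * Worked example (illustrative): 0x3f808000 lies exactly halfway between
 * two bf16 values; the truncated high half 0x3f80 has an even low bit, so
 * it rounds down to 0x3f80, while 0x3f818000 (odd low bit in 0x3f81)
 * rounds up to 0x3f82.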
 */
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a4e89cf32..be672f6ef 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -480,9 +480,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     }
 }

+void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
+    // subnormals are flushed to zero on this platform
     for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -962,7 +969,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
-        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -20650,7 +20657,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(ggml_bf16_t);
-                ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+                ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case GGML_TYPE_F32:

diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 16e0a9aaa..f4361d751 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -25,14 +25,12 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
-    n = n.astype(np.float32, copy=False).view(np.int32)
+    n = n.astype(np.float32, copy=False).view(np.uint32)
     # force nan to quiet
-    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-    # flush subnormals to zero
-    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
     # round to nearest even
-    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-    return n.astype(np.int16)
+    n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
+    return n.astype(np.uint16)

 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
@@ -49,10 +47,10 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
 def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
-    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)


-__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)


 def quantize_bf16(n: np.ndarray):

From 76614f352e94d25659306d9e97321f204e5de0d3 Mon Sep 17 00:00:00 2001
From: jdomke <28772296+jdomke@users.noreply.github.com>
Date: Sun, 4 Aug 2024 01:34:41 +0900
Subject: [PATCH 058/154] ggml : reading the runtime sve config of the cpu
 (#8709)

* ggml : reading the runtime sve config of the cpu

* change to one time init to prevent performance drop

* prefix variable to avoid possible conflicts

* revert xxhash fix and add brackets

---------

Co-authored-by: domke <673751-domke@users.noreply.gitlab.com>
---
 ggml/src/ggml-aarch64.c | 28 ++++++++++++++--------------
 ggml/src/ggml-impl.h    |  1 +
 ggml/src/ggml-quants.c  |  4 ++--
 ggml/src/ggml-quants.h  |  4 ++++
 ggml/src/ggml.c         |  9 +++++++++
 5 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index af53dea17..d7a608997 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 3daee4926..190af0810 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -143,6 +143,7 @@ extern "C" {

 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif

 // 16-bit float

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 16aaf523f..d5b91c2db 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);

diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 88b1f3269..525d5ee30 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);

+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index be672f6ef..42f4a34b8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -37,6 +37,9 @@
 #include
 #endif

+#if defined(__ARM_FEATURE_SVE)
+int ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -3558,6 +3561,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

     GGML_ASSERT_ALIGNED(ctx->mem_buffer);

+#if defined(__ARM_FEATURE_SVE)
+    if (!ggml_sve_cnt_b) {
+        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

     ggml_critical_section_end();

From 4b77ea95f56a4c49bc995f08eac62a6416875ccc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 4 Aug 2024 05:53:20 +0300
Subject: [PATCH 059/154] flake.lock: Update (#8847)

---
 flake.lock | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/flake.lock b/flake.lock
index 3dc68abb6..c54af88ea 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
       "nixpkgs-lib": "nixpkgs-lib"
     },
     "locked": {
-      "lastModified": 1719994518,
-      "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
+      "lastModified": 1722555600,
+      "narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
       "owner": "hercules-ci",
       "repo": "flake-parts",
-      "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
+      "rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
       "type": "github"
     },
     "original": {
@@ -20,11 +20,11 @@
   },
   "nixpkgs": {
     "locked": {
-      "lastModified": 1722062969,
-      "narHash": "sha256-QOS0ykELUmPbrrUGmegAUlpmUFznDQeR4q7rFhl8eQg=",
+      "lastModified": 1722421184,
+      "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
       "owner": "NixOS",
       "repo": "nixpkgs",
-      "rev": "b73c2221a46c13557b1b3be9c2070cc42cf01eb3",
+      "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
       "type": "github"
     },
     "original": {
@@ -36,14 +36,14 @@
   },
   "nixpkgs-lib": {
     "locked": {
-      "lastModified": 1719876945,
-      "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
+      "lastModified": 1722555339,
+      "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
       "type": "tarball",
-      "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+      "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
     },
     "original": {
       "type": "tarball",
-      "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+      "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
     }
   },
   "root": {

From 01aae2b4975b57a265ce8194928fd87f2d71027e Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Sat, 3 Aug 2024 15:07:47 +0200
Subject: [PATCH 060/154] baby-llama : remove duplicate vector include

---
 examples/baby-llama/baby-llama.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 4f6c3746a..aca332e94 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"

-#include
 #include
 #include
 #include

From ecf6b7f23e664afd7ff856ec39034240ce438daa Mon Sep 17 00:00:00 2001
From: Brian Cunnie
Date: Sun, 4 Aug 2024 03:55:03 -0700
Subject: [PATCH 061/154] batched-bench : handle empty `-npl` (#8839)

* [example] batched-bench "segmentation fault"

When `llama-batched-bench` is invoked _without_ setting `-npl`, "number
of parallel prompts", it segfaults.

The segfault is caused by invoking `max_element()` on a zero-length
vector, `n_pl`

This commit addresses that by first checking to see if the number of
parallel prompts is zero, and if so sets the maximum sequence size to 1;
otherwise, sets it to the original, the result of `max_element()`.
This fixes the crash observed when running `lldb build/bin/llama-batched-bench -- -m models/Meta-Llama-3-8B.gguf`:

```
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x0)
    frame #0: 0x000000010000366c llama-batched-bench`main(argc=3, argv=0x000000016fdff268) at batched-bench.cpp:72:28
   69  	    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
   70
   71  	    // ensure enough sequences are available
-> 72  	    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
```

* Update examples/batched-bench/batched-bench.cpp

Co-authored-by: compilade

---------

Co-authored-by: Georgi Gerganov
Co-authored-by: compilade
---
 examples/batched-bench/batched-bench.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 718f0a61a..25e7c775a 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

From 978ba3d83d17b10fdf9807006048432b5b3769fc Mon Sep 17 00:00:00 2001
From: ardfork <134447697+ardfork@users.noreply.github.com>
Date: Sun, 4 Aug 2024 18:16:23 +0000
Subject: [PATCH 062/154] Server: Don't ignore llama.cpp params (#8754)

* Don't ignore llama.cpp params

* Add fallback for max_tokens
---
 examples/server/server.cpp |  2 +-
 examples/server/utils.hpp  | 18 ------------------
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7813a2957..d5f131d9b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -900,7 +900,7 @@ struct server_context {

         slot.params.stream           = json_value(data, "stream", false);
         slot.params.cache_prompt     = json_value(data, "cache_prompt", false);
-        slot.params.n_predict        = json_value(data, "n_predict", default_params.n_predict);
+        slot.params.n_predict        = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k           = json_value(data, "top_k", default_sparams.top_k);
         slot.sparams.top_p           = json_value(data, "top_p", default_sparams.top_p);
         slot.sparams.min_p           = json_value(data, "min_p", default_sparams.min_p);

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index db6b3b74d..e6a1f0697 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse(

     llama_params["__oaicompat"] = true;

-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

From 0d6fb52be0c1b7e77eb855f3adc4952771c8ce4c Mon Sep 17 00:00:00 2001
From: Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
Date: Sun, 4 Aug 2024 14:17:16 -0400
Subject: [PATCH 063/154] Install curl in runtime layer (#8693)

---
 .devops/llama-server.Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile
index b631d5806..ff558604e 100644
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev curl
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/llama-server /llama-server

From c02b0a8a4dee489b29073f25a27ed6e5628e86e1 Mon Sep 17 00:00:00 2001
From: wangshuai09 <391746016@qq.com>
Date: Mon, 5 Aug 2024 12:22:30 +0800
Subject: [PATCH 064/154] cann: support q4_0 model (#8822)

---
 ggml/src/ggml-cann.cpp                                |  12 +-
 ggml/src/ggml-cann/acl_tensor.cpp                     |  31 +-
 ggml/src/ggml-cann/acl_tensor.h                       |  36 ++-
 ggml/src/ggml-cann/aclnn_ops.cpp                      |  45 ++-
 ggml/src/ggml-cann/kernels/CMakeLists.txt             |   3 +-
 ggml/src/ggml-cann/kernels/ascendc_kernels.h          |   2 +
 .../kernels/quantize_float_to_q4_0.cpp                | 273 ++++++++++++++++++
 7 files changed, 357 insertions(+), 45 deletions(-)
 create mode 100644 ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index 461febcc0..a15bc8aa2 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
 GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                                        const void* src,
                                                        void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
  */
 GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
     const ggml_tensor* tensor, void* src, void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -1666,10 +1664,17 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                // case GGML_TYPE_Q4_0:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q8_0:
+                    // TODO: fix me
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                    if (op->src[0]->ne[0]-1 > QK8_0) {
+                        return true;
+                    }
+                    return false;
+                case GGML_TYPE_Q4_0:
                     return true;
                 default:
                     return false;
@@ -1694,6 +1699,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
                     return true;
                 default:
                     return false;

diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
index 960ce9a03..d120ce6ac 100644
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_INT16;
         case GGML_TYPE_I32:
             return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
         default:
             return ACL_DT_UNDEFINED;
     }
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }

-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format,
-                                   size_t offset) {
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                   const ggml_tensor* src1,
                                   int64_t* bcast_src0_ne,

diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
index 7d0bf04e0..4734a9cb8 100644
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -23,6 +23,9 @@
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H

+#include
+#include
+
 #include
 #include "common.h"
@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
                                    size_t offset = 0);

 /**
- * @brief Creates an ACL tensor from provided parameters.
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
+ *        should be size_t or float.
  *
  * @details This function creates an ACL tensor using the provided data pointer,
  *          data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
 * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return Pointer to the created ACL tensor.
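 *
 * Usage sketch (illustrative only; the buffer and sizes are hypothetical):
 * @code
 * int64_t ne[] = {n_cols, n_rows};
 * size_t  nb[] = {sizeof(uint16_t), n_cols * sizeof(uint16_t)};
 * aclTensor* t = ggml_cann_create_tensor(buf, ACL_FLOAT16,
 *                                        sizeof(uint16_t), ne, nb, 2);
 * @endcode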
 */
+template<typename TYPE>
 aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0);
+                                   TYPE type_size, int64_t* ne, TYPE* nb,
+                                   int64_t dims,
+                                   aclFormat format = ACL_FORMAT_ND,
+                                   size_t offset = 0) {
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    int64_t acl_storage_len = 0;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (ne[i] - 1) * nb[i];
+    }
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                        format, &acl_storage_len, 1, data_ptr);
+
+    return acl_tensor;
+}

 /**
  * @brief Checks if tensors require broadcasting based on their shapes.

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 556284888..171439132 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -910,6 +910,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F16) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +978,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F32) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -2463,21 +2477,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
 * @param dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input

     // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
     // is regarded as batch. weight need transpose.
     int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    size_t weight_elem_size = sizeof(uint8_t);
-    size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    }
+    else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    }
+    else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+
     // size of one matrix is element_size * height * width.
     size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];

     // scale stored at the end of weight. Also need transpose.
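    // (illustrative note) per ggml_backend_cann_transform_q4_0 above, the
    // packed 4-bit quants are stored first and the fp16 group scales follow,
    // one per QK4_0 (== QK8_0) elements, hence the scale offset computed below.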
+    GGML_ASSERT(QK4_0 == QK8_0);
     int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2541,8 +2567,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
             (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
             input_elem_size, input_ne, input_nb, 2);
         aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-            (char*)src0->data + batch0 * weight_stride, ACL_INT8,
-            weight_elem_size, weight_ne, weight_nb, 2);
+            (char*)src0->data + batch0 * weight_stride,
+            ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+            weight_nb, 2);
         aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
             scale_offset + batch0 * scale_stride, ACL_FLOAT16,
             scale_elem_size, scale_ne, scale_nb, 2);
@@ -2596,11 +2623,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         case GGML_TYPE_F16:
             ggml_cann_mat_mul_fp(ctx, dst);
             break;
-        // case GGML_TYPE_Q4_0:
-        //     ggml_cann_mul_mat_q4_0(ctx, dst);
-        //     break;
+        case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_q8_0(ctx, dst);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
             GGML_ABORT("fatal error");

diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
index f12a4d43f..5b4fef91b 100644
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
     get_row_q8_0.cpp
     quantize_f32_q8_0.cpp
    quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
     dup.cpp
 )
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )

-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
index bf8914751..7e153208c 100644
--- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -8,6 +8,8 @@

 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"

 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"

diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
new file mode 100644
index 000000000..f6deee3c5
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -0,0 +1,273 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+#define Group_Size 32
+
+template <typename SRC_T>
+class QUANTIZE_FLOAT_TO_Q4_0 {
+   public:
+    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        // input stride of data elements
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        // output stride of data elements
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        // scale saved one by one after data: [group1_scale, group2_scale, ...]
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / Group_Size;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
+                               output_ne[3] * sizeof(uint8_t) / 2;
+
+        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
+                                                 group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
+        pipe.InitBuffer(output_queue, BUFFER_NUM,
+                        Group_Size * sizeof(int8_t) / 2);
+        pipe.InitBuffer(cast_queue, BUFFER_NUM, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, BUFFER_NUM, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, BUFFER_NUM, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, BUFFER_NUM, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, BUFFER_NUM, 16 * sizeof(half));
+        pipe.InitBuffer(int8_queue, BUFFER_NUM, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, BUFFER_NUM, Group_Size * sizeof(half));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
+        DataCopy(input_local, input_gm[offset], Group_Size);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        // Reinterpret-cast Group_Size (32) int4b_t values as Group_Size / 2
+        // int8_t values, and use DataCopyPad to avoid the 32-byte alignment
+        // requirement of DataCopy.
+        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
+        LocalTensor<int8_t> output_int8_local =
+            output_local.ReinterpretCast<int8_t>();
+
+        DataCopyExtParams dataCopyParams;
+        dataCopyParams.blockCount = 1;
+        dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
+        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
+
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<float> input_local) {
+        DataCopy(cast_local, input_local, Group_Size);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<half> input_local) {
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + Group_Size * group;
+
+        // output_offset is an offset into output_gm, whose datatype is int8_t,
+        // so it must be divided by 2 to account for the packed int4b_t values.
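+        // Worked example (assumed values): with output_stride[1] = 128 and
+        // Group_Size = 32, row 0 / group 1 starts at element index 32, i.e.
+        // byte offset 32 / 2 = 16 in the int8_t-typed output_gm.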
+ const int64_t output_offset = (i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + + Group_Size * group) / 2; + copy_in(input_offset); + + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor min_local = min_queue.AllocTensor(); + LocalTensor int8_local = int8_queue.AllocTensor(); + LocalTensor half_local = half_queue.AllocTensor(); + + input_to_cast(cast_local, input_local); + + ReduceMax(max_local, cast_local, work_local, Group_Size); + ReduceMin(min_local, cast_local, work_local, Group_Size); + const float max_value = max_local.GetValue(0); + const float min_value = min_local.GetValue(0); + float d = max_value; + if (min_value < 0 && (-1 * min_value) > max_value) { + d = min_value; + } + + d = d / (-8); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, Group_Size); + } + + // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] + float scalar = 8.5f; + Adds(cast_local, cast_local, scalar, Group_Size); + Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); + scalar = 15.0f; + Mins(cast_local, cast_local, scalar, Group_Size); + scalar = -8.0f; + Adds(cast_local, cast_local, scalar, Group_Size); + + // float->half->int4b + Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); + Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); + + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + max_queue.FreeTensor(max_local); + min_queue.FreeTensor(min_local); + int8_queue.FreeTensor(int8_local); + half_queue.FreeTensor(half_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + scale_queue.FreeTensor(scale_local); + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue min_queue; + TQue scale_queue; + TQue cast_queue; + TQue int8_queue; + TQue half_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } 
+} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} From 655858ace0cf2720e56eb01f84ad05e0c94ada3c Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 29 Jul 2024 15:06:06 +0200 Subject: [PATCH 065/154] ggml : move c parameter comment to ggml_rope_ext (ggml/901) This commit moves the comment for the c parameter from ggml_rope to ggml_rope_ext. The comment is currently incorrect as ggml_rope does not have a c parameter (freq_factors tensor). Signed-off-by: Daniel Bevenius --- ggml/include/ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d8d3dceef..a9e88e592 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1456,7 +1456,6 @@ extern "C" { // if mode & 2 == 1, GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions - // c is freq factors (e.g. phi3-128k), (optional) GGML_API struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1473,6 +1472,7 @@ extern "C" { int mode); // custom RoPE + // c is freq factors (e.g. 
phi3-128k), (optional) GGML_API struct ggml_tensor * ggml_rope_ext( struct ggml_context * ctx, struct ggml_tensor * a, From a3738b2fa7c60ef2c4592435d1aa7fb8f1f69c3e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Aug 2024 17:28:08 +0200 Subject: [PATCH 066/154] vulkan : implement Stable Diffusion operators (ggml/904) * Fix Vulkan repeat op * Implement Vulkan concat op * Delete old Vulkan shader generator * Implement Vulkan im2col op * Implement Vulkan unary gelu_quick op * Implement Vulkan group_norm op * Implement Vulkan timestep_embedding op * Implement Vulkan upscale op * Fix Vulkan vk_context tensor extra index issue * Fix Vulkan matmul shader parameter bug * Properly fix Vulkan matmul shader parameter bug * Add Vulkan ADD f16 + f32 -> f16 operator support * Implement Vulkan tanh op * Fix Vulkan group count too large Validation error on non-Nvidia GPUs * Throw error when too much memory is requested * Fix another Vulkan group count too large Validation error on non-Nvidia GPUs * Fix matmul MMQ condition * Implement Vulkan pad op * Fix Vulkan crash when tensor is used multiple times in a compute graph * Add Vulkan CONCAT f16 + f16 -> f16 op * Add Vulkan LEAKY_RELU op --- ggml/src/ggml-vulkan.cpp | 840 +++++++++++++----- ggml/src/vulkan-shaders/add.comp | 6 +- ggml/src/vulkan-shaders/clamp.comp | 8 +- ggml/src/vulkan-shaders/concat.comp | 35 + ggml/src/vulkan-shaders/copy.comp | 8 +- ggml/src/vulkan-shaders/div.comp | 6 +- ggml/src/vulkan-shaders/gelu.comp | 2 +- ggml/src/vulkan-shaders/gelu_quick.comp | 23 + .../vulkan-shaders/generic_binary_head.comp | 6 +- .../vulkan-shaders/generic_unary_head.comp | 4 + ggml/src/vulkan-shaders/group_norm.comp | 66 ++ ggml/src/vulkan-shaders/im2col.comp | 57 ++ ggml/src/vulkan-shaders/leaky_relu.comp | 22 + ggml/src/vulkan-shaders/mul.comp | 6 +- ggml/src/vulkan-shaders/norm.comp | 2 +- ggml/src/vulkan-shaders/pad.comp | 26 + ggml/src/vulkan-shaders/relu.comp | 2 +- ggml/src/vulkan-shaders/rms_norm.comp | 2 +- ggml/src/vulkan-shaders/scale.comp | 6 +- ggml/src/vulkan-shaders/silu.comp | 2 +- ggml/src/vulkan-shaders/soft_max.comp | 2 +- ggml/src/vulkan-shaders/square.comp | 8 +- ggml/src/vulkan-shaders/sum_rows.comp | 2 +- ggml/src/vulkan-shaders/tanh.comp | 21 + .../vulkan-shaders/timestep_embedding.comp | 41 + ggml/src/vulkan-shaders/types.comp | 4 +- ggml/src/vulkan-shaders/upscale.comp | 36 + .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 82 +- 28 files changed, 1032 insertions(+), 293 deletions(-) create mode 100644 ggml/src/vulkan-shaders/concat.comp create mode 100644 ggml/src/vulkan-shaders/gelu_quick.comp create mode 100644 ggml/src/vulkan-shaders/group_norm.comp create mode 100644 ggml/src/vulkan-shaders/im2col.comp create mode 100644 ggml/src/vulkan-shaders/leaky_relu.comp create mode 100644 ggml/src/vulkan-shaders/pad.comp create mode 100644 ggml/src/vulkan-shaders/tanh.comp create mode 100644 ggml/src/vulkan-shaders/timestep_embedding.comp create mode 100644 ggml/src/vulkan-shaders/upscale.comp diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index fa68360b9..d7fea78d0 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -177,24 +177,33 @@ struct vk_device_struct { vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16; vk_pipeline pipeline_mul_f32; vk_pipeline pipeline_div_f32; - vk_pipeline pipeline_add_f32; + vk_pipeline pipeline_concat_f32, 
pipeline_concat_f16, pipeline_concat_i32; + vk_pipeline pipeline_upscale_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_gelu_quick_f32; vk_pipeline pipeline_silu_f32; vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_leaky_relu_f32; + vk_pipeline pipeline_tanh_f32; vk_pipeline pipeline_diag_mask_inf_f32; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; vk_pipeline pipeline_argsort_f32; vk_pipeline pipeline_sum_rows_f32; + vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; + vk_pipeline pipeline_timestep_embedding_f32; std::vector pipelines; @@ -320,7 +329,7 @@ struct vk_op_binary_push_constants { uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t d_offset; - float param1; float param2; + float param1; float param2; int32_t param3; }; struct vk_op_diag_mask_push_constants { @@ -358,6 +367,25 @@ struct vk_op_argsort_push_constants { int32_t order; }; +struct vk_op_im2col_push_constants { + uint32_t batch_offset; uint32_t offset_delta; + uint32_t IC; + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t KW; uint32_t KH; + uint32_t pelements; + uint32_t CHW; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; + int32_t d0; int32_t d1; +}; + +struct vk_op_timestep_embedding_push_constants { + uint32_t nb1; + uint32_t dim; + uint32_t max_period; +}; + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -367,28 +395,32 @@ struct vk_staging_memcpy { size_t n; }; -struct vk_context { - size_t idx; +struct vk_op_upscale_push_constants { + uint32_t ne; uint32_t d_offset; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; + float sf0; float sf1; float sf2; float sf3; +}; +struct vk_context_struct { vk_submission * s; std::vector seqs; - ggml_tensor * exit_tensor; + int exit_tensor_idx; std::vector in_memcpys; std::vector out_memcpys; vk_queue * q; }; +typedef std::shared_ptr vk_context; +typedef std::weak_ptr vk_context_ref; struct ggml_tensor_extra_gpu { - size_t ctx_idx; - vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { - ctx_idx = 0; buffer_gpu.reset(); offset = 0; } @@ -459,8 +491,10 @@ struct ggml_backend_vk_context { vk_buffer buffer_pool[MAX_VK_BUFFERS]; - vk_context * compute_ctx; - vk_context * transfer_ctx; + vk_context_ref compute_ctx; + vk_context_ref transfer_ctx; + + std::vector tensor_ctxs; }; #ifdef GGML_VULKAN_MEMORY_DEBUG @@ -510,12 +544,12 @@ static vk_instance_t vk_instance; static size_t vk_skip_checks; static size_t vk_output_tensor; -static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor); -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor); +static void 
ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_tensor * tensor); +static void ggml_vk_check_results_1(ggml_tensor * tensor); #endif -typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); @@ -708,11 +742,11 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s return s; } -static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { - VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")"); +static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { return; } + VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); std::vector> tl_wait_vals; std::vector> tl_signal_vals; @@ -844,21 +878,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_ q.stage_flags = stage_flags; } -static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_create_context()"); - ctx->gc.contexts.emplace_back(); - vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1]; - memset((void *) result, 0, sizeof(vk_context)); - result->idx = ctx->gc.contexts.size() - 1; +static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { + vk_context result = std::make_shared(); + VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); + ctx->gc.contexts.emplace_back(result); result->q = &q; return result; } -static vk_context * ggml_vk_create_temporary_context(vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_create_temporary_context()"); - vk_context * result = new vk_context; - memset((void *) result, 0, sizeof(vk_context)); - result->idx = 0; +static vk_context ggml_vk_create_temporary_context(vk_queue& q) { + vk_context result = std::make_shared(); + VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); result->q = &q; return result; } @@ -915,6 +945,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")"); + if (size > device->max_memory_allocation_size) { + throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit"); + } + std::lock_guard guard(device->mutex); vk_buffer buf = std::make_shared(); @@ -1027,7 +1061,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { return { buf, 0, VK_WHOLE_SIZE }; } -static void ggml_vk_sync_buffers(vk_context * ctx) { +static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); const std::vector mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; @@ -1041,7 +1075,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) { ); } -static void ggml_vk_wait_events(vk_context * ctx, std::vector&& events) { +static void ggml_vk_wait_events(vk_context& ctx, std::vector&& 
events) { VK_LOG_DEBUG("ggml_vk_wait_events()"); if (events.empty()) { return; @@ -1598,6 +1632,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -1605,20 +1640,31 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, 
sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); @@ -1634,6 +1680,11 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); } static vk_device ggml_vk_get_device(size_t idx) { @@ -2077,9 +2128,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->staging_size = 0; ctx->staging_offset = 0; - ctx->compute_ctx = nullptr; - ctx->transfer_ctx = nullptr; - #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 
0 : atoi(skip_checks)); @@ -2112,7 +2160,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type } static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()"); + VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; } @@ -2126,7 +2174,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return ctx->device->pipeline_matmul_f16; } - GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (src1_type != GGML_TYPE_F32) { + return nullptr; + } switch (src0_type) { case GGML_TYPE_Q4_0: @@ -2370,7 +2420,7 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); @@ -2410,7 +2460,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector w s.signal_semaphores = std::move(signal_semaphores); } -static void ggml_vk_ctx_end(vk_context * ctx) { +static void ggml_vk_ctx_end(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); if (ctx->s == nullptr) { return; @@ -2420,7 +2470,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) { ctx->s = nullptr; } -static void ggml_vk_ctx_begin(vk_device& device, vk_context * subctx) { +static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")"); if (subctx->s != nullptr) { ggml_vk_ctx_end(subctx); @@ -2453,7 +2503,7 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) { } } -static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { +static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); GGML_ASSERT(!ggml_is_contiguous(tensor)); // Buffer is already mapped @@ -2558,7 +2608,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } } -static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // 
Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { @@ -2623,7 +2673,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s } } -static void ggml_vk_buffer_write_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging); } @@ -2638,7 +2688,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); ggml_vk_ctx_begin(dst->device, subctx); ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true); ggml_vk_ctx_end(subctx); @@ -2650,8 +2700,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); - - delete subctx; } } @@ -2660,12 +2708,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); } -static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); GGML_ASSERT(src != nullptr); + // TODO: staging_offset is not used + // Check if dst is pinned memory vk_buffer buf = nullptr; size_t buf_offset; @@ -2714,18 +2764,18 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys); } -static void ggml_vk_buffer_read_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging); } static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")"); + VK_LOG_DEBUG("ggml_vk_buffer_read(" << 
src->buffer << ", " << offset << ", " << size << ")"); if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true); ggml_vk_ctx_end(subctx); @@ -2737,12 +2787,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - - delete subctx; } } -static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); // Make sure both buffers are on same device GGML_ASSERT(src->device == dst->device); @@ -2756,15 +2804,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr if (src->device == dst->device) { VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device - vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); src->device->device.resetFences({ src->device->fence }); - - delete subctx; } else { VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device @@ -2783,7 +2829,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); @@ -2791,8 +2837,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); dst->device->device.resetFences({ dst->device->fence }); - - delete subctx; } static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { @@ -2855,7 +2899,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct } static void ggml_vk_matmul( - ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, 
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, @@ -2879,7 +2923,7 @@ static void ggml_vk_matmul( } static void ggml_vk_matmul_id( - ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, @@ -2916,7 +2960,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_ GGML_ABORT("fatal error"); } -static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { +static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); const int tensor_type_size = ggml_type_size(tensor->type); @@ -2934,7 +2978,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 }); } -static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3107,7 +3151,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su ); // NOLINT } -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << 
src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3272,7 +3316,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3343,7 +3387,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << 
src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3418,7 +3462,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); @@ -3431,7 +3475,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, } } -static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -3618,7 +3662,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * ); // NOLINT } -static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" 
<< src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -3794,7 +3838,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); } -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst); @@ -3803,8 +3847,8 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subct } } -static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // guaranteed to be an integer due to the check in ggml_can_repeat +static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")"); const uint64_t ne0 = dst->ne[0]; const uint64_t ne1 = dst->ne[1]; const uint64_t ne2 = dst->ne[2]; @@ -3825,6 +3869,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx const uint64_t nb02 = src0->nb[2]; const uint64_t nb03 = src0->nb[3]; + // guaranteed to be an integer due to the check in ggml_can_repeat const uint64_t nr0 = ne0/ne00; const uint64_t nr1 = ne1/ne01; const uint64_t nr2 = ne2/ne02; @@ -3852,8 +3897,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx for (uint64_t k1 = 0; k1 < ne01; k1++) { for (uint64_t i0 = 0; i0 < nr0; i0++) { copies.push_back({ - src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, - dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + src_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, ne00*nb0, }); } @@ -3874,11 +3919,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { - case GGML_OP_ADD: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_add_f32; - } - return nullptr; case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (dst->type == GGML_TYPE_F16) { @@ -3888,6 +3928,14 @@ static vk_pipeline 
ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; + case GGML_OP_ADD: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_add_f32; + } + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_add_f16_f32_f16; + } + return nullptr; case GGML_OP_MUL: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_mul_f32; @@ -3898,6 +3946,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_div_f32; } return nullptr; + case GGML_OP_CONCAT: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_concat_f32; + } + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_concat_f16; + } + if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + return ctx->device->pipeline_concat_i32; + } + return nullptr; + case GGML_OP_UPSCALE: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_upscale_f32; + } + return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_scale_f32; @@ -3913,6 +3977,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_clamp_f32; } return nullptr; + case GGML_OP_PAD: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_pad_f32; + } + return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -3922,6 +3991,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_norm_f32; } return nullptr; + case GGML_OP_GROUP_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_group_norm_f32; + } + return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rms_norm_f32; @@ -3939,11 +4013,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_gelu_f32; } break; + case GGML_UNARY_OP_GELU_QUICK: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_gelu_quick_f32; + } + break; case GGML_UNARY_OP_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_relu_f32; } break; + case GGML_UNARY_OP_TANH: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_tanh_f32; + } + break; default: break; } @@ -3995,6 +4079,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_sum_rows_f32; } return nullptr; + case GGML_OP_IM2COL: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_im2col_f32; + } + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_im2col_f32_f16; + } + return nullptr; + case GGML_OP_TIMESTEP_EMBEDDING: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return 
ctx->device->pipeline_timestep_embedding_f32; + } + return nullptr; + case GGML_OP_LEAKY_RELU: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_leaky_relu_f32; + } + return nullptr; default: return nullptr; } @@ -4018,9 +4120,12 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: return true; default: return false; @@ -4028,7 +4133,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4124,7 +4229,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c vk_buffer d_D = extra->buffer_gpu.lock(); // Workaround for tiny tensor inputs on ROPE - if (use_src1 && y_sz > d_D->size) { + if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } @@ -4173,13 +4278,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1); - switch (dst->op) { + switch (op) { case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_SOFT_MAX: case GGML_OP_SUM_ROWS: - elements = { (uint32_t)ggml_nrows(src0), 1, 1 }; - break; + { + const uint32_t nr = ggml_nrows(src0); + if (nr > 262144) { + elements = { 512, 512, CEIL_DIV(nr, 262144) }; + } else if (nr > 512) { + elements = { 512, CEIL_DIV(nr, 512), 1 }; + } else { + elements = { nr, 1, 1 }; + } + } break; + case GGML_OP_GROUP_NORM: + { + const uint32_t num_groups = dst->op_params[0]; + elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; + } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_ROPE: elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; @@ -4190,6 +4308,49 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c case GGML_OP_ARGSORT: elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 }; break; + case GGML_OP_IM2COL: + { + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? 
dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t batch = src1->ne[3]; + + elements = { OW * KW * KH, OH, batch * IC }; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + const uint32_t dim = dst->op_params[0]; + uint32_t half_ceil = (dim + 1) / 2; + elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; + } break; + case GGML_OP_ADD: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_CPY: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_UNARY: + { + const uint32_t ne = ggml_nelements(dst); + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + } break; default: elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; break; @@ -4216,7 +4377,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1) { subbuf_y = { d_Y, y_buf_offset, y_sz }; } else { - subbuf_y = { d_X, 0, d_X->size }; + subbuf_y = { d_X, 0, x_sz }; } ggml_vk_sync_buffers(subctx); @@ -4227,11 +4388,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src2) { subbuf_z = { d_Z, z_buf_offset, z_sz }; } else { - subbuf_z = { d_X, 0, d_X->size }; + subbuf_z = { d_X, 0, x_sz }; } ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (op == GGML_OP_IM2COL) { + // im2col uses only src1 and dst buffers + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); @@ -4249,8 +4414,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03); - switch (dst->op) { + switch (op) { case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: elements = { (uint32_t)ne01, 1, 1 }; break; @@ -4286,11 +4452,11 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } } -static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {}); } -static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4301,11 +4467,11 @@ 
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4316,11 +4482,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4331,11 +4497,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4346,11 +4512,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] 
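/*
 * ggml strides (nb) are byte offsets; dividing by ggml_type_size() converts
 * them to element strides so the shaders can index data_a/data_b/data_d as
 * typed arrays. The PC template parameter of ggml_vk_op_f32 is the matching
 * push-constant struct for each op family. A minimal model of the conversion
 * (hypothetical struct, not the patch's own type):
 *
 *   #include <cstdint>
 *   struct binary_pc_sketch { uint32_t ne; uint32_t nb01; };
 *   binary_pc_sketch make_pc(uint64_t ne, uint64_t nb1_bytes, uint64_t tsize) {
 *       // e.g. a contiguous f32 row of 1024 elements: nb1 = 4096 bytes -> 1024
 *       return { (uint32_t)ne, (uint32_t)(nb1_bytes / tsize) };
 *   }
 */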
/ src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + int * op_params = (int *)dst->op_params; + + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, op_params[0], + }); +} + +static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + + const float sf0 = (float)dst->ne[0] / src0->ne[0]; + const float sf1 = (float)dst->ne[1] / src0->ne[1]; + const float sf2 = (float)dst->ne[2] / src0->ne[2]; + const float sf3 = (float)dst->ne[3] / src0->ne[3]; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { + (uint32_t)ggml_nelements(dst), 0, + (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], + sf0, sf1, sf2, sf3, + }); +} + +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4364,7 +4563,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co }); } -static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4377,7 +4576,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons }); } -static void 
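/*
 * For CONCAT, o[dim] above shifts reads that fall outside src0 back into
 * src1's coordinate space, and the concat dimension travels to the shader in
 * param3. For UPSCALE the push constants are just per-axis ratios, e.g. for
 * an assumed 64x64 -> 128x128 nearest-neighbour upscale:
 *
 *   const float sf0 = 128.0f / 64.0f;   // 2.0 along ne0
 *   const float sf1 = 128.0f / 64.0f;   // 2.0 along ne1
 *   const float sf2 = 1.0f, sf3 = 1.0f; // channels and batch unchanged
 */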
ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4391,7 +4590,20 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co }); } -static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + }); +} + +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4406,27 +4618,37 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons }); } -static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + int * op_params = (int *)dst->op_params; + + uint32_t num_groups = op_params[0]; + uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + static const float eps = 1e-6f; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }); +} + +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { 
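/*
 * All unary activations (SILU, GELU, GELU_QUICK, RELU, TANH) funnel through
 * this one wrapper; the pipeline chosen earlier already encodes which
 * function runs, so the push constants only need the element count plus
 * placeholders:
 *
 *   // sketch: one entry point, N pipelines
 *   // pipeline = select(unary_op); dispatch(pipeline, { ne, 0, 0.0f, 0.0f });
 */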
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } -static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; float scale = op_params[0]; @@ -4451,7 +4673,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, }); } -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const int n_dims = ((int32_t *) dst->op_params)[1]; // const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -4475,7 +4697,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con }); } -static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; uint32_t ncols = src0->ne[0]; @@ -4494,10 +4716,59 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, }); } -static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }); } +static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int32_t s0 = dst->op_params[0]; + const int32_t s1 = dst->op_params[1]; + const int32_t p0 = dst->op_params[2]; + const int32_t p1 = dst->op_params[3]; + const int32_t d0 = dst->op_params[4]; + const int32_t d1 = dst->op_params[5]; + + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + const uint32_t IH = is_2D ? src1->ne[1] : 1; + const uint32_t IW = src1->ne[0]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t offset_delta = src1->nb[is_2D ? 
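/*
 * src1 is f32 here, so dividing nb by 4 turns byte strides into float
 * counts: offset_delta is the per-channel stride, batch_offset the per-image
 * stride. Worked example under assumed conv shapes IW = IH = 8, KW = KH = 3,
 * stride 1, pad 1, dilation 1 (so OW = OH = 8):
 *
 *   // pelements = OW * KW * KH = 8 * 3 * 3 = 72 work items per output row
 *   // dispatched as { pelements, OH, batch * IC } = { 72, 8, batch * IC }
 */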
2 : 1] / 4; // nb is byte offset, src is type float32 + const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + + const uint32_t pelements = OW * KW * KH; + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, { + batch_offset, offset_delta, + IC, IW, IH, OW, OH, KW, KH, + pelements, + IC * KH * KW, + s0, s1, p0, p1, d0, d1, + }); +} + +static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t dim = dst->op_params[0]; + const uint32_t max_period = dst->op_params[1]; + const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { + nb1, dim, max_period, + }); +} + +static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const float * op_params = (const float *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); +} + #ifdef GGML_VULKAN_RUN_TESTS static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { @@ -4686,7 +4957,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( @@ -4894,7 +5165,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); @@ -5027,7 +5298,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( @@ -5175,7 +5446,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig; - bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; + bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? 
GGML_TYPE_F16 : src1->type) != nullptr : false; const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig); const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); @@ -5211,24 +5482,33 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ROPE: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: break; default: return; @@ -5236,6 +5516,13 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: + if ( + x_sz > ctx->device->max_memory_allocation_size || + y_sz > ctx->device->max_memory_allocation_size || + d_sz > ctx->device->max_memory_allocation_size || + split_k_size > ctx->device->max_memory_allocation_size) { + GGML_ABORT("Requested preallocation size is too large"); + } if (ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; } @@ -5430,7 +5717,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ +static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { @@ -5457,7 +5744,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: break; default: return; @@ -5468,13 +5757,17 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -5483,6 +5776,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_MUL_MAT_ID: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -5490,102 +5786,137 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod return; } - if (ctx->compute_ctx == nullptr) { - ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); - ggml_vk_ctx_begin(ctx->device, ctx->compute_ctx); + vk_context compute_ctx; + + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + 
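/*
 * vk_context is now a shared handle and the backend keeps only weak
 * references, so contexts are freed when their last strong owner goes away
 * instead of being manually nulled. A self-contained model of the
 * expired()/lock() pattern used here (assuming shared_ptr/weak_ptr
 * semantics for vk_context):
 *
 *   #include <memory>
 *   struct holder { std::weak_ptr<int> current; };
 *   std::shared_ptr<int> get_or_create(holder & h) {
 *       if (h.current.expired()) {
 *           auto fresh = std::make_shared<int>(0); // stands in for ggml_vk_create_context
 *           h.current = fresh;                     // backend remembers it weakly
 *           return fresh;
 *       }
 *       return h.current.lock();                   // reuse the live context
 *   }
 */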
ggml_vk_ctx_begin(ctx->device, compute_ctx); + } else { + compute_ctx = ctx->compute_ctx.lock(); } switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_repeat(ctx, compute_ctx, src0, node); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD: - ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_add(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL: - ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_DIV: - ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_div(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_CONCAT: + ggml_vk_concat(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_UPSCALE: + ggml_vk_upscale(ctx, compute_ctx, src0, node); break; case GGML_OP_SCALE: - ggml_vk_scale(ctx, ctx->compute_ctx, src0, node); + ggml_vk_scale(ctx, compute_ctx, src0, node); break; case GGML_OP_SQR: - ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node); + ggml_vk_sqr(ctx, compute_ctx, src0, node); break; case GGML_OP_CLAMP: - ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node); + ggml_vk_clamp(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_PAD: + ggml_vk_pad(ctx, compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); + ggml_vk_cpy(ctx, compute_ctx, src0, node); break; case GGML_OP_NORM: - ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); + ggml_vk_norm(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_GROUP_NORM: + ggml_vk_group_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node); + ggml_vk_rms_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: - ggml_vk_unary(ctx, ctx->compute_ctx, src0, node); + case GGML_UNARY_OP_TANH: + ggml_vk_unary(ctx, compute_ctx, src0, node); break; default: return; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node); + ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: - ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: - ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node); + ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node); break; case GGML_OP_ARGSORT: - ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node); + ggml_vk_argsort(ctx, compute_ctx, src0, node); break; case GGML_OP_SUM_ROWS: - ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node); + ggml_vk_sum_rows(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_IM2COL: + ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_LEAKY_RELU: + ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node); + ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node); break; 
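/*
 * After dispatch, each graph node records (weakly) the context it was
 * encoded into -- ctx->tensor_ctxs[node_idx] = compute_ctx just below -- so
 * the execution pass can recover it by node index rather than stashing an
 * index in the tensor extras:
 *
 *   // sketch: per-node registry, sized once per graph
 *   // tensor_ctxs.resize(n_nodes); tensor_ctxs[i] = current_ctx; // weak ref
 */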
default: return; } - extra->ctx_idx = ctx->compute_ctx->idx; + ctx->tensor_ctxs[node_idx] = compute_ctx; #ifdef GGML_VULKAN_CHECK_RESULTS // Force context reset on each node so that each tensor ends up in its own context @@ -5594,13 +5925,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod #endif if (last_node) { - ggml_vk_ctx_end(ctx->compute_ctx); - ctx->compute_ctx->exit_tensor = node; - ctx->compute_ctx = nullptr; + ggml_vk_ctx_end(compute_ctx); + compute_ctx->exit_tensor_idx = node_idx; + ctx->compute_ctx.reset(); } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5608,13 +5939,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_GET_ROWS: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -5626,6 +5961,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_NONE: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + case GGML_OP_REPEAT: extra = (ggml_tensor_extra_gpu *) tensor->extra; break; @@ -5633,7 +5972,9 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: extra = (ggml_tensor_extra_gpu *) tensor->extra; break; default: @@ -5656,31 +5997,31 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(ctx, tensor); + ggml_vk_check_results_0(tensor); #endif - vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; + vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); // Only run if ctx hasn't been submitted yet - if (!subctx.seqs.empty()) { + if (!subctx->seqs.empty()) { // Do staging buffer copies - for (auto& cpy : subctx.in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(&subctx, ctx->fence); + ggml_vk_submit(subctx, ctx->fence); } - if (tensor == subctx.exit_tensor) { + if (tensor_idx == subctx->exit_tensor_idx) { VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); ctx->device->device.resetFences({ ctx->fence }); // Do staging buffer copies - for (auto& cpy : subctx.out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - subctx.in_memcpys.clear(); - subctx.out_memcpys.clear(); + 
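/*
 * Staged-copy protocol around each submission: in_memcpys are host->staging
 * writes that must happen before the command buffer is submitted, while
 * out_memcpys are staging->host reads that are only valid after the fence
 * signals; both lists are then cleared for reuse. Condensed model
 * (hypothetical copy struct { dst, src, n }):
 *
 *   for (auto & c : in)  memcpy(c.dst, c.src, c.n); // before submit
 *   submit(); wait_fence();
 *   for (auto & c : out) memcpy(c.dst, c.src, c.n); // after fence
 *   in.clear(); out.clear();
 */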
subctx->in_memcpys.clear(); + subctx->out_memcpys.clear(); } return true; @@ -5725,8 +6066,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->staging_offset = 0; - ctx->compute_ctx = nullptr; - ctx->transfer_ctx = nullptr; + ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); } @@ -6063,15 +6403,20 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_write_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -6081,15 +6426,20 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_read_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { @@ -6099,16 +6449,21 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer src_buf = src_extra->buffer_gpu.lock(); vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); - ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, 
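/*
 * Device offsets always resolve as the buffer-extra base plus the tensor's
 * view offset, so a view 256 bytes into its parent copies from
 * src_extra->offset + 256; the length argument is ggml_nbytes(src).
 *
 *   // sketch: resolving a view's device offset
 *   // size_t device_off = extra_offset + view_offs; // e.g. 4096 + 256 = 4352
 */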
ggml_nbytes(src)); + ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); return true; } @@ -6118,25 +6473,27 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - if(ctx->transfer_ctx == nullptr) { + if(ctx->transfer_ctx.expired()) { return; } - ggml_vk_ctx_end(ctx->transfer_ctx); + vk_context transfer_ctx = ctx->transfer_ctx.lock(); - for (auto& cpy : ctx->transfer_ctx->in_memcpys) { + ggml_vk_ctx_end(transfer_ctx); + + for (auto& cpy : transfer_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(ctx->transfer_ctx, ctx->fence); + ggml_vk_submit(transfer_ctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); ctx->device->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->transfer_ctx->out_memcpys) { + for (auto& cpy : transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->transfer_ctx = nullptr; + ctx->transfer_ctx.reset(); } static bool ggml_vk_is_empty(ggml_tensor * node) { @@ -6159,8 +6516,11 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen last_node -= 1; } + // Reserve tensor context space for all nodes + ctx->tensor_ctxs.resize(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node); } for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6170,13 +6530,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen continue; } - bool ok = ggml_vk_compute_forward(ctx, node); + bool ok = ggml_vk_compute_forward(ctx, node, i); if (!ok) { - fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + if (node->op == GGML_OP_UNARY) { + std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl; + } else { + std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + } } #ifdef GGML_VULKAN_CHECK_RESULTS else { - ggml_vk_check_results_1(ctx, node); + ggml_vk_check_results_1(node); } #endif GGML_ASSERT(ok); @@ -6196,8 +6560,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: return ggml_is_contiguous(op->src[0]); default: return false; @@ -6270,11 +6636,11 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return false; } break; - // case GGML_OP_REPEAT: - // { - // ggml_type src0_type = op->src[0]->type; - // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - // } break; + case GGML_OP_REPEAT: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_NONE: @@ -6283,18 +6649,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, 
const case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: + case GGML_OP_RMS_NORM: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: - case GGML_OP_RMS_NORM: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: return true; default: return false; @@ -6509,10 +6882,12 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } } -static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (ggml_backend_buffer_is_vk(tensor->buffer)) { + const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer); + + if (is_gpu) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -6533,13 +6908,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); - std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); - if (ggml_backend_buffer_is_vk(tensor->buffer)) { + if (is_gpu) { free(tensor_data); } } @@ -6548,8 +6920,8 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { - if (tensor->op == GGML_OP_TRANSPOSE) { +static void ggml_vk_check_results_0(ggml_tensor * tensor) { + if (tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -6565,7 +6937,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * ggml_tensor * src2 = tensor->src[2]; struct ggml_init_params iparams = { - /*.mem_size =*/ 1024*1024*1024, + /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; @@ -6624,7 +6996,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src0, "src0"); + ggml_vk_print_tensor(src0, "src0"); } } if (src1 != nullptr) { @@ -6666,23 +7038,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src1, "src1"); - std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; if (src1->src[0] != nullptr) { std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << 
src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl; - } - if (src1->src[1] != nullptr) { - std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl; - } - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0); - std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(src1_clone, done); + ggml_vk_print_tensor(src1, "src1"); } } if (src2 != nullptr) { @@ -6724,23 +7080,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src2, "src2"); - std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl; - if (src2->src[0] != nullptr) { - std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl; - } - if (src2->src[1] != nullptr) { - std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl; - } - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0); - std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(src2_clone, done); + ggml_vk_print_tensor(src2, "src2"); } } @@ -6752,16 +7092,24 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_DIV) { tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone); + } else if (tensor->op == GGML_OP_CONCAT) { + tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int 
*)tensor->op_params); + } else if (tensor->op == GGML_OP_UPSCALE) { + tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_SCALE) { tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]); } else if (tensor->op == GGML_OP_SQR) { tensor_clone = ggml_sqr(ggml_ctx, src0_clone); } else if (tensor->op == GGML_OP_CLAMP) { tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + } else if (tensor->op == GGML_OP_PAD) { + tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_GROUP_NORM) { + tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params); } else if (tensor->op == GGML_OP_RMS_NORM) { tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { @@ -6777,12 +7125,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * const int mode = ((int32_t *) tensor->op_params)[2]; //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; - float freq_base = ((float *) tensor->op_params)[5]; - float freq_scale = ((float *) tensor->op_params)[6]; - float ext_factor = ((float *) tensor->op_params)[7]; - float attn_factor = ((float *) tensor->op_params)[8]; - float beta_fast = ((float *) tensor->op_params)[9]; - float beta_slow = ((float *) tensor->op_params)[10]; + const float freq_base = ((float *) tensor->op_params)[5]; + const float freq_scale = ((float *) tensor->op_params)[6]; + const float ext_factor = ((float *) tensor->op_params)[7]; + const float attn_factor = ((float *) tensor->op_params)[8]; + const float beta_fast = ((float *) tensor->op_params)[9]; + const float beta_slow = ((float *) tensor->op_params)[10]; tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { @@ -6792,9 +7140,15 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_UNARY_OP_GELU: tensor_clone = ggml_gelu(ggml_ctx, src0_clone); break; + case GGML_UNARY_OP_GELU_QUICK: + tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone); + break; case GGML_UNARY_OP_RELU: tensor_clone = ggml_relu(ggml_ctx, src0_clone); break; + case GGML_UNARY_OP_TANH: + tensor_clone = ggml_tanh(ggml_ctx, src0_clone); + break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); @@ -6823,6 +7177,23 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params); } else if (tensor->op == GGML_OP_SUM_ROWS) { tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone); + } else if (tensor->op == GGML_OP_IM2COL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 
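/*
 * With GGML_VULKAN_CHECK_RESULTS the backend re-runs every node through the
 * plain CPU ggml API and diffs the outputs. im2col's op_params are laid out
 * as (s0, s1, p0, p1, d0, d1, is_2D), so the clone just unpacks them in
 * order, e.g.:
 *
 *   // op_params = { 2, 2, 1, 1, 1, 1, 1 }
 *   // -> stride 2x2, padding 1x1, dilation 1x1, 2-D mode
 */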
= tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + + const bool is_2D = tensor->op_params[6] == 1; + tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type); + } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { + const int32_t dim = tensor->op_params[0]; + const int32_t max_period = tensor->op_params[1]; + tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period); + } else if (tensor->op == GGML_OP_LEAKY_RELU) { + const float * op_params = (const float *)tensor->op_params; + tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); } @@ -6834,7 +7205,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone"); + ggml_vk_print_tensor(tensor_clone, "tensor_clone"); } comp_size = ggml_nbytes(tensor_clone); @@ -6851,9 +7222,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * } ggml_free(ggml_ctx); + + VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); } -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_tensor * tensor) { if (tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -6977,11 +7350,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl << "Correct:" << std::endl; ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0); std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); - std::cerr << std::endl << "Correct:" << std::endl; - ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0); - std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); } @@ -7018,5 +7386,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * if (ggml_backend_buffer_is_vk(tensor->buffer)) { free(tensor_data); } + + VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif diff --git a/ggml/src/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp index 8475b0119..3974845d6 100644 --- a/ggml/src/vulkan-shaders/add.comp +++ b/ggml/src/vulkan-shaders/add.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp index ca272e227..7071302a4 100644 --- a/ggml/src/vulkan-shaders/clamp.comp +++ b/ggml/src/vulkan-shaders/clamp.comp @@ -4,10 +4,12 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); 
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); + const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); } diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/vulkan-shaders/concat.comp new file mode 100644 index 000000000..08ab5514b --- /dev/null +++ b/ggml/src/vulkan-shaders/concat.comp @@ -0,0 +1,35 @@ +#version 450 + +#include "types.comp" +#include "generic_binary_head.comp" + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + const int dim = p.param3; + + if (idx >= p.ne) { + return; + } + + const uint i3 = idx / (p.ne22*p.ne21*p.ne20); + const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20; + const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20); + const uint i2_offset = i2*p.ne21*p.ne20; + const uint i1 = (idx - i3_offset - i2_offset) / p.ne20; + const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20; + + uint o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03)); + + const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; + const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10; + const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20; + + const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; + +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); +#else + data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx]; +#endif +} diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp index efb55876e..c26917c0f 100644 --- a/ggml/src/vulkan-shaders/copy.comp +++ b/ggml/src/vulkan-shaders/copy.comp @@ -4,13 +4,15 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } #ifndef OPTIMIZATION_ERROR_WORKAROUND - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); #else - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)]; + data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; #endif } diff --git a/ggml/src/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp index 8ee4bfc73..8cfce58b1 100644 --- a/ggml/src/vulkan-shaders/div.comp +++ b/ggml/src/vulkan-shaders/div.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp index 9fe807cce..4cc7a68ca 100644 --- a/ggml/src/vulkan-shaders/gelu.comp +++ b/ggml/src/vulkan-shaders/gelu.comp @@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 
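/*
 * 0.79788456... is sqrt(2/pi); together with GELU_COEF_A = 0.044715 this is
 * the usual tanh approximation
 *   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
 * and the index rewrite below un-flattens the enlarged dispatch:
 *   i = z * 262144 + y * 512 + x   // 262144 = 512 * 512, mirrors the host fold
 */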
0.79788456080286535587989211986876f; - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/gelu_quick.comp b/ggml/src/vulkan-shaders/gelu_quick.comp new file mode 100644 index 000000000..e6e6fcfd2 --- /dev/null +++ b/ggml/src/vulkan-shaders/gelu_quick.comp @@ -0,0 +1,23 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const float GELU_QUICK_COEF = -1.702f; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float x = float(data_a[i]); + data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); +} diff --git a/ggml/src/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp index ab45d2564..b6beaff1c 100644 --- a/ggml/src/vulkan-shaders/generic_binary_head.comp +++ b/ggml/src/vulkan-shaders/generic_binary_head.comp @@ -7,7 +7,7 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint d_offset; - float param1; float param2; + float param1; float param2; int param3; } p; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; @@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +uint get_idx() { + return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; +} + uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp index de08de7cd..eacdefc7d 100644 --- a/ggml/src/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/vulkan-shaders/generic_unary_head.comp @@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +uint get_idx() { + return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; +} + uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; diff --git a/ggml/src/vulkan-shaders/group_norm.comp b/ggml/src/vulkan-shaders/group_norm.comp new file mode 100644 index 000000000..5ad9b28da --- /dev/null +++ b/ggml/src/vulkan-shaders/group_norm.comp @@ -0,0 +1,66 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable +#define BLOCK_SIZE 512 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +shared float tmp[BLOCK_SIZE]; + +void main() { + const uint group_size = p.KX; + const float eps = p.param1; + + const 
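/*
 * One workgroup normalizes one group: a first pass accumulates the mean and
 * a second the variance, each finished by a shared-memory tree reduction
 * that halves the active threads per step (valid because BLOCK_SIZE is a
 * power of two). Host-side sketch of the same reduction over 512 partials:
 *
 *   for (int s = 512 / 2; s > 0; s >>= 1)
 *       for (int t = 0; t < s; ++t) tmp[t] += tmp[t + s]; // tmp[0] = total
 */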
uint tid = gl_LocalInvocationID.x; + const uint start = gl_WorkGroupID.x * group_size + tid; + const uint end = start + group_size; + + tmp[tid] = 0.0f; + + // Calculate mean + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + tmp[tid] += float(data_a[col]); + } + + // tmp up partial tmps and write back result + barrier(); + [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(); + } + + const float mean = tmp[0] / group_size; + barrier(); + tmp[tid] = 0.0f; + + // Calculate variance + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + const float xi = float(data_a[col]) - mean; + data_d[col] = D_TYPE(xi); + tmp[tid] += xi * xi; + } + + // sum up partial sums and write back result + barrier(); + [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(); + } + + const float variance = tmp[0] / group_size; + const float scale = inversesqrt(variance + eps); + + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + data_d[col] *= D_TYPE(scale); + } +} diff --git a/ggml/src/vulkan-shaders/im2col.comp b/ggml/src/vulkan-shaders/im2col.comp new file mode 100644 index 000000000..4d48610a3 --- /dev/null +++ b/ggml/src/vulkan-shaders/im2col.comp @@ -0,0 +1,57 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage : require + +layout (push_constant) uniform parameter +{ + uint batch_offset; uint offset_delta; + uint IC; + uint IW; uint IH; + uint OW; uint OH; + uint KW; uint KH; + uint pelements; + uint CHW; + int s0; int s1; + int p0; int p1; + int d0; int d1; +} p; + +#include "types.comp" + +#define BLOCK_SIZE 256 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.x; + if (i >= p.pelements) { + return; + } + + const uint ksize = p.OW * (p.KH > 1 ? 
p.KW : 1); + const uint kx = i / ksize; + const uint kd = kx * ksize; + const uint ky = (i - kd) / p.OW; + const uint ix = i % p.OW; + + const uint oh = gl_GlobalInvocationID.y; + const uint batch = gl_GlobalInvocationID.z / p.IC; + const uint ic = gl_GlobalInvocationID.z % p.IC; + + const uint iiw = ix * p.s0 + kx * p.d0 - p.p0; + const uint iih = oh * p.s1 + ky * p.d1 - p.p1; + + const uint offset_dst = + ((batch * p.OH + oh) * p.OW + ix) * p.CHW + + (ic * (p.KW * p.KH) + ky * p.KW + kx); + + if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) { + data_d[offset_dst] = D_TYPE(0.0f); + } else { + const uint offset_src = ic * p.offset_delta + batch * p.batch_offset; + data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]); + } +} diff --git a/ggml/src/vulkan-shaders/leaky_relu.comp b/ggml/src/vulkan-shaders/leaky_relu.comp new file mode 100644 index 000000000..d90a99aea --- /dev/null +++ b/ggml/src/vulkan-shaders/leaky_relu.comp @@ -0,0 +1,22 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float val = float(data_a[i]); + data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); +} diff --git a/ggml/src/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp index bbb0aa1d2..bfb61c92d 100644 --- a/ggml/src/vulkan-shaders/mul.comp +++ b/ggml/src/vulkan-shaders/mul.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp index 803dbdcb3..6627a50bd 100644 --- a/ggml/src/vulkan-shaders/norm.comp +++ b/ggml/src/vulkan-shaders/norm.comp @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; shared vec2 sum[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; sum[tid] = vec2(0.0f, 0.0f); diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp new file mode 100644 index 000000000..a465cd52b --- /dev/null +++ b/ggml/src/vulkan-shaders/pad.comp @@ -0,0 +1,26 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (idx >= p.ne) { + return; + } + + const uint i3 = idx / (p.ne12*p.ne11*p.ne10); + const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; + const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); + const uint i2_offset = i2*p.ne11*p.ne10; + const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; + const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; + + const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + 
i0*p.nb00; + const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; + + const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; + + data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); +} diff --git a/ggml/src/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp index 7e5baa5b8..52a19b62a 100644 --- a/ggml/src/vulkan-shaders/relu.comp +++ b/ggml/src/vulkan-shaders/relu.comp @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp index cfd08d345..b554400ba 100644 --- a/ggml/src/vulkan-shaders/rms_norm.comp +++ b/ggml/src/vulkan-shaders/rms_norm.comp @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; shared FLOAT_TYPE sum[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp index 510cb7237..5cd2f668d 100644 --- a/ggml/src/vulkan-shaders/scale.comp +++ b/ggml/src/vulkan-shaders/scale.comp @@ -4,9 +4,11 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1)); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); } diff --git a/ggml/src/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp index 15920f06e..4d36f88e0 100644 --- a/ggml/src/vulkan-shaders/silu.comp +++ b/ggml/src/vulkan-shaders/silu.comp @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp index 1b8419c7c..0bd51ecab 100644 --- a/ggml/src/vulkan-shaders/soft_max.comp +++ b/ggml/src/vulkan-shaders/soft_max.comp @@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE]; void main() { const uint tid = gl_LocalInvocationID.x; - const uint rowx = gl_WorkGroupID.x; + const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint rowy = rowx % p.KY; float slope = 1.0f; diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp index 8dd19333d..1fa118c99 100644 --- a/ggml/src/vulkan-shaders/square.comp +++ b/ggml/src/vulkan-shaders/square.comp @@ -4,10 +4,12 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val); + const FLOAT_TYPE val = 
FLOAT_TYPE(data_a[src0_idx(idx)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); } diff --git a/ggml/src/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp index ce2f1e2f3..961e5ffa1 100644 --- a/ggml/src/vulkan-shaders/sum_rows.comp +++ b/ggml/src/vulkan-shaders/sum_rows.comp @@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint col = gl_LocalInvocationID.x; tmp[col] = FLOAT_TYPE(0.0f); diff --git a/ggml/src/vulkan-shaders/tanh.comp b/ggml/src/vulkan-shaders/tanh.comp new file mode 100644 index 000000000..74630dc7f --- /dev/null +++ b/ggml/src/vulkan-shaders/tanh.comp @@ -0,0 +1,21 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + data_d[i] = D_TYPE(tanh(data_a[i])); +} diff --git a/ggml/src/vulkan-shaders/timestep_embedding.comp b/ggml/src/vulkan-shaders/timestep_embedding.comp new file mode 100644 index 000000000..79e065a93 --- /dev/null +++ b/ggml/src/vulkan-shaders/timestep_embedding.comp @@ -0,0 +1,41 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage : require + +layout (push_constant) uniform parameter +{ + uint nb1; + uint dim; + uint max_period; +} p; + +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable +#define BLOCK_SIZE 256 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_WorkGroupID.y; + const uint j = gl_GlobalInvocationID.x; + const uint d_offset = i * p.nb1; + + if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { + data_d[d_offset + p.dim] = 0.f; + } + + const uint half_dim = p.dim / 2; + if (j >= half_dim) { + return; + } + + const float timestep = float(data_a[i]); + const float freq = float(exp(-log(p.max_period) * j / half_dim)); + const float arg = timestep * freq; + data_d[d_offset + j] = D_TYPE(cos(arg)); + data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); +} diff --git a/ggml/src/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp index d24c172ca..21dce72fc 100644 --- a/ggml/src/vulkan-shaders/types.comp +++ b/ggml/src/vulkan-shaders/types.comp @@ -6,7 +6,7 @@ #define QUANT_K 1 #define QUANT_R 1 -#ifndef LOAD_VEC_A +#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float #elif LOAD_VEC_A == 4 #define A_TYPE vec4 @@ -19,7 +19,7 @@ #define QUANT_K 1 #define QUANT_R 1 -#ifndef LOAD_VEC_A +#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float16_t #elif LOAD_VEC_A == 4 #define A_TYPE f16vec4 diff --git a/ggml/src/vulkan-shaders/upscale.comp b/ggml/src/vulkan-shaders/upscale.comp new file mode 100644 index 000000000..511a086ea --- /dev/null +++ b/ggml/src/vulkan-shaders/upscale.comp @@ -0,0 +1,36 @@ +#version 450 + +layout (push_constant) uniform parameter +{ + uint ne; uint d_offset; + uint nb00; uint nb01; uint nb02; uint nb03; + uint ne10; uint ne11; uint ne12; uint 
ne13; + float sf0; float sf1; float sf2; float sf3; +} p; + +#include "types.comp" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (idx >= p.ne) { + return; + } + + const uint i10 = idx % p.ne10; + const uint i11 = (idx / p.ne10) % p.ne11; + const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; + const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; + + const uint i00 = uint(i10 / p.sf0); + const uint i01 = uint(i11 / p.sf1); + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + + data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); +} diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index c9dbf9dfd..258a1933f 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,20 +30,6 @@ #define ASYNCIO_CONCURRENCY 64 -// define prototypes -void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str); -bool directory_exists(const std::string& path); -bool create_directory(const std::string& path); -std::string to_uppercase(const std::string& input); -bool string_ends_with(const std::string& str, const std::string& suffix); -std::string join_paths(const std::string& path1, const std::string& path2); -std::string basename(const std::string &path); -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16); -std::map merge_maps(const std::map& a, const std::map& b); -void matmul_shaders(std::vector>& tasks, bool fp16, bool matmul_id); -void process_shaders(std::vector>& tasks); -void write_output_files(); - std::mutex lock; std::vector> shader_fnames; @@ -52,7 +38,7 @@ std::string input_dir = "vulkan-shaders"; std::string output_dir = "/tmp"; std::string target_hpp = "ggml-vulkan-shaders.hpp"; std::string target_cpp = "ggml-vulkan-shaders.cpp"; -bool clean = true; +bool no_clean = false; const std::vector type_names = { "f32", @@ -283,9 +269,12 @@ void matmul_shaders(std::vector>& tasks, bool fp16, bool matmu for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); + // For unaligned, load one at a time for f32/f16, or two at a time for quants + std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; + // For aligned matmul loads std::string load_vec_a = (tname == "f32" || tname == "f16") ? 
load_vec : "2"; tasks.push_back(std::async(std::launch::async, [=] { - string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); + string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); })); tasks.push_back(std::async(std::launch::async, [=] { string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); @@ -354,6 +343,9 @@ void process_shaders(std::vector>& tasks) { tasks.push_back(std::async(std::launch::async, [=] { string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); tasks.push_back(std::async(std::launch::async, [=] { string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); @@ -371,6 +363,9 @@ void process_shaders(std::vector>& tasks) { tasks.push_back(std::async(std::launch::async, [] { string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); + })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); @@ -396,15 +391,42 @@ void process_shaders(std::vector>& tasks) { string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}); + })); + + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); 
})); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -438,6 +460,17 @@ void process_shaders(std::vector>& tasks) { tasks.push_back(std::async(std::launch::async, [=] { string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); + + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); + })); + + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); } void write_output_files() { @@ -478,9 +511,8 @@ void write_output_files() { } fprintf(src, "\n};\n\n"); - if (clean) { + if (!no_clean) { std::remove(path.c_str()); - // fprintf(stderr, "Removed: %s\n", path.c_str()); } } @@ -496,18 +528,6 @@ int main(int argc, char** argv) { } } - if (argc <= 1 || args.find("--help") != args.end()) { - std::cout << "Usage:\n" - "\tvulkan-shaders-gen [options]\n\n" - "Options:\n" - "\t--glslc Path to glslc executable (default: /usr/bin/glslc)\n" - "\t--input-dir Directory containing shader sources (required)\n" - "\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n" - "\t--target-hpp Path to generate a header file with shader declarations in C++ format\n" - "\t--target-cpp Path to generate a source code file implementing the declared shaders (optional)\n" - "\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n"; - return EXIT_SUCCESS; - } if (args.find("--glslc") != args.end()) { GLSLC = args["--glslc"]; // Path to glslc } @@ -524,7 +544,7 @@ int main(int argc, char** argv) { target_cpp = args["--target-cpp"]; // Path to generated cpp file } if (args.find("--no-clean") != args.end()) { - clean = false; // Keep temporary SPIR-V files in output-dir after build + no_clean = true; // Keep temporary SPIR-V files in output-dir after build } if (!directory_exists(input_dir)) { From 5587e57a76630651752031223cc7024cb32cf308 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Aug 2024 19:13:25 +0300 Subject: [PATCH 067/154] sync : ggml ggml-ci --- scripts/sync-ggml-am.sh | 1 + scripts/sync-ggml.last | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index c40025356..aa4895c6d 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -64,6 +64,7 @@ while read c; do src/ggml*.cu \ src/ggml-cuda/* \ src/ggml-sycl/* \ + src/vulkan-shaders/* \ include/ggml*.h \ tests/test-opt.cpp \ tests/test-grad0.cpp \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 998b23ac6..1b82b1047 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -31d544f87835a55602883fe09156bb85a4c163d8 +18703ad600cc68dbdb04d57434c876989a841d12 From 
064cdc265fb63590c7c8f04a609d36ef200d55a7 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 5 Aug 2024 07:52:55 +0200 Subject: [PATCH 068/154] vulkan : fix Qantized Mat-Vec Mul on AMD GPUs for ncols < 64 (#8855) * Fix Vulkan mul mat vec invalid results when ncols < warp size * Only run backend ops mul mat vec block size test if block size not already covered --- ggml/src/vulkan-shaders/mul_mat_vec.comp | 13 ++++++++++--- tests/test-backend-ops.cpp | 7 ++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ggml/src/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp index 15d2a8063..46a6369bc 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec.comp @@ -16,6 +16,13 @@ void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; const uint tid = gl_LocalInvocationID.x; + // There are not enough cols to use all threads + if (tid >= p.ncols) { + return; + } + + const uint block_size = min(p.ncols, BLOCK_SIZE); + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -23,8 +30,8 @@ void main() { tmp[tid] = FLOAT_TYPE(0.0f); - [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) { - const uint col = i*BLOCK_SIZE + 2*tid; + [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) { + const uint col = i*block_size + 2*tid; const uint ib = (row*p.ncols + col)/QUANT_K; // block index const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index @@ -38,7 +45,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f5065f145..54cef05c3 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2271,9 +2271,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (ggml_type type_a : other_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { - - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), { 1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); + if (ggml_blck_size(type_a) != 256) { + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1})); + } + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1})); } } From f1ea5146d741a0c9be6d8fbfab9323fea6c4a3f0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Aug 2024 08:53:39 +0300 Subject: [PATCH 069/154] llama : better replace_all (#8852) --- src/llama.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e6f303d31..ff234565d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -122,17 +122,14 @@ static std::string trim(const std::string & str) { } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 
0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); } - s = std::move(result); } static bool is_float_close(float a, float b, float abs_tol) { From 400ae6f65f0b55babd48d1e3ec7fd663a97fc8d0 Mon Sep 17 00:00:00 2001 From: BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> Date: Mon, 5 Aug 2024 01:54:10 -0400 Subject: [PATCH 070/154] readme : update model list (#8851) --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 775ce2c88..34992b0af 100644 --- a/README.md +++ b/README.md @@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well. - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) +- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) +- [x] [Smaug](https://huggingface.co/models?search=Smaug) +- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B) +- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM) +- [x] [Flan T5](https://huggingface.co/models?search=flan-t5) +- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) +- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) From e31a4f679779220312c165b0f5994c680a610e38 Mon Sep 17 00:00:00 2001 From: stduhpf Date: Mon, 5 Aug 2024 08:18:27 +0200 Subject: [PATCH 071/154] cmake: fix paths for vulkan shaders compilation on Windows (#8573) * Vulkan-shaders: attempt fix compilation on windows * fix miss-matched parenthesis --- .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index 258a1933f..f6f4f116a 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -179,11 +179,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) { return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin()); } -#ifdef _WIN32 - static const char path_separator = '\\'; -#else - static const char path_separator = '/'; -#endif +static const char path_separator = '/'; std::string join_paths(const std::string& path1, const std::string& path2) { return path1 + path_separator + path2; @@ -198,7 +194,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + #ifdef _WIN32 + std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + #else + 
std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + #endif for (const auto& define : defines) { cmd.push_back("-D" + define.first + "=" + define.second); } @@ -482,10 +482,16 @@ void write_output_files() { for (const auto& pair : shader_fnames) { const std::string& name = pair.first; - const std::string& path = pair.second; + #ifdef _WIN32 + std::string path = pair.second; + std::replace(path.begin(), path.end(), '/', '\\' ); + #else + const std::string& path = pair.second; + #endif + FILE* spv = fopen(path.c_str(), "rb"); if (!spv) { - std::cerr << "Error opening SPIR-V file: " << path << "\n"; + std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n"; continue; } @@ -497,7 +503,7 @@ void write_output_files() { size_t read_size = fread(data.data(), 1, size, spv); fclose(spv); if (read_size != size) { - std::cerr << "Error reading SPIR-V file: " << path << "\n"; + std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n"; continue; } From d3f0c7166adfa952237e0f437a5344362d8256d4 Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:38:01 +0200 Subject: [PATCH 072/154] Stop the generation when <|eom_id|> token is encountered - needed for Llama 3.1 tool call support (#8858) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * gguf-py, llama : add constants and methods related to Llama-3.1 <|eom_id|> token * llama : find Llama-3.1 <|eom_id|> token id during vocab loading * llama-vocab : add Llama-3.1 <|eom_id|> token to the set of tokens stopping the generation --------- Co-authored-by: Stanisław Szymczyk --- gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama-vocab.cpp | 7 ++++++- src/llama-vocab.h | 2 ++ src/llama.cpp | 14 ++++++++++++++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e343c2ef1..59ffd92ea 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -161,6 +161,7 @@ class Keys: SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" @@ -1327,3 +1328,4 @@ KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2e0b335ee..76385a828 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -828,6 +828,9 @@ class GGUFWriter: def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) + def add_eom_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOM_ID, id) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 133094904..9be076f6d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { return token != -1 && ( token == llama_token_eos_impl(vocab) || - token 
== llama_token_eot_impl(vocab) + token == llama_token_eot_impl(vocab) || + token == llama_token_eom_impl(vocab) ); } @@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { return vocab.special_eot_id; } +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + int32_t llama_tokenize_impl( const struct llama_vocab & vocab, const char * text, diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 30b565d55..7adfc16da 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -45,6 +45,7 @@ struct llama_vocab { id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token + id special_eom_id = -1; // tokenizer flags bool tokenizer_add_space_prefix = false; @@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); llama_token llama_token_eot_impl (const struct llama_vocab & vocab); +llama_token llama_token_eom_impl (const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, diff --git a/src/llama.cpp b/src/llama.cpp index ff234565d..a7b1c9ebd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -359,6 +359,7 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -456,6 +457,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -5583,6 +5585,7 @@ static void llm_load_vocab( { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, + { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, }; for (const auto & it : special_token_types) { @@ -5635,6 +5638,17 @@ static void llm_load_vocab( } } } + + // find EOM token: "<|eom_id|>" + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID + // for now, we apply this workaround to find the EOM token based on its text + if (vocab.special_eom_id == -1) { + const auto & t = vocab.token_to_id.find("<|eom_id|>"); + if (t != vocab.token_to_id.end()) { + vocab.special_eom_id = t->second; + } + } } // build special tokens cache From 1ef14b30075da594cb24f0ab858a14bf1d8d1797 Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 5 Aug 2024 21:15:28 +1000 Subject: [PATCH 073/154] py: Add more authorship metadata from model card (#8810) * py: add more authorship metadata from model card * fixup! 
py: add more authorship metadata from model card --- gguf-py/gguf/metadata.py | 129 +++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index 15189f717..ea4d02705 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -284,20 +284,67 @@ class Metadata: ######################## if model_card is not None: - if "model_name" in model_card and metadata.name is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.name = model_card.get("model_name") + def use_model_card_metadata(metadata_key: str, model_card_key: str): + if model_card_key in model_card and getattr(metadata, metadata_key, None) is None: + setattr(metadata, metadata_key, model_card.get(model_card_key)) - if "model_creator" in model_card and metadata.author is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.author = model_card.get("model_creator") + def use_array_model_card_metadata(metadata_key: str, model_card_key: str): + # Note: Will append rather than replace if already exist + tags_value = model_card.get(model_card_key, None) + if tags_value is None: + return - if "model_type" in model_card and metadata.basename is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.basename = model_card.get("model_type") + current_value = getattr(metadata, metadata_key, None) + if current_value is None: + current_value = [] + + if isinstance(tags_value, str): + current_value.append(tags_value) + elif isinstance(tags_value, list): + current_value.extend(tags_value) + + setattr(metadata, metadata_key, current_value) + + # LLAMA.cpp's direct internal convention + # (Definitely not part of hugging face formal/informal standard) + ######################################### + use_model_card_metadata("name", "name") + use_model_card_metadata("author", "author") + use_model_card_metadata("version", "version") + use_model_card_metadata("organization", "organization") + use_model_card_metadata("description", "description") + use_model_card_metadata("finetune", "finetune") + use_model_card_metadata("basename", "basename") + use_model_card_metadata("size_label", "size_label") + use_model_card_metadata("source_url", "url") + use_model_card_metadata("source_doi", "doi") + use_model_card_metadata("source_uuid", "uuid") + use_model_card_metadata("source_repo_url", "repo_url") + + # LLAMA.cpp's huggingface style convention + # (Definitely not part of hugging face formal/informal standard... 
but with model_ appended to match their style) + ########################################### + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_author") + use_model_card_metadata("version", "model_version") + use_model_card_metadata("organization", "model_organization") + use_model_card_metadata("description", "model_description") + use_model_card_metadata("finetune", "model_finetune") + use_model_card_metadata("basename", "model_basename") + use_model_card_metadata("size_label", "model_size_label") + use_model_card_metadata("source_url", "model_url") + use_model_card_metadata("source_doi", "model_doi") + use_model_card_metadata("source_uuid", "model_uuid") + use_model_card_metadata("source_repo_url", "model_repo_url") + + # Hugging Face Direct Convention + ################################# + + # Not part of huggingface model card standard but notice some model creator using it + # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_creator") + use_model_card_metadata("basename", "model_type") if "base_model" in model_card: # This represents the parent models that this is based on @@ -329,58 +376,18 @@ class Metadata: base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}" metadata.base_models.append(base_model) - if "license" in model_card and metadata.license is None: - metadata.license = model_card.get("license") + use_model_card_metadata("license", "license") + use_model_card_metadata("license_name", "license_name") + use_model_card_metadata("license_link", "license_link") - if "license_name" in model_card and metadata.license_name is None: - metadata.license_name = model_card.get("license_name") + use_array_model_card_metadata("tags", "tags") + use_array_model_card_metadata("tags", "pipeline_tag") - if "license_link" in model_card and metadata.license_link is None: - metadata.license_link = model_card.get("license_link") + use_array_model_card_metadata("languages", "languages") + use_array_model_card_metadata("languages", "language") - tags_value = model_card.get("tags", None) - if tags_value is not None: - - if metadata.tags is None: - metadata.tags = [] - - if isinstance(tags_value, str): - metadata.tags.append(tags_value) - elif isinstance(tags_value, list): - metadata.tags.extend(tags_value) - - pipeline_tags_value = model_card.get("pipeline_tag", None) - if pipeline_tags_value is not None: - - if metadata.tags is None: - metadata.tags = [] - - if isinstance(pipeline_tags_value, str): - metadata.tags.append(pipeline_tags_value) - elif isinstance(pipeline_tags_value, list): - metadata.tags.extend(pipeline_tags_value) - - language_value = model_card.get("languages", model_card.get("language", None)) - if language_value is not None: - - if metadata.languages is None: - metadata.languages = [] - - if isinstance(language_value, str): - metadata.languages.append(language_value) - elif isinstance(language_value, list): - metadata.languages.extend(language_value) - - dataset_value = model_card.get("datasets", model_card.get("dataset", None)) - if dataset_value is not None: - - if metadata.datasets is None: - metadata.datasets = [] - - if isinstance(dataset_value, str): - metadata.datasets.append(dataset_value) - elif isinstance(dataset_value, list): - metadata.datasets.extend(dataset_value) + use_array_model_card_metadata("datasets", "datasets") + use_array_model_card_metadata("datasets", "dataset") # Hugging 
Face Parameter Heuristics #################################### From b9dfc25ca385a83bde9e9456c4d4fae15377bc7b Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Mon, 5 Aug 2024 05:43:40 -0700 Subject: [PATCH 074/154] ggml : fix overflows in elu function (#8866) It's helpful to use expm1f(x), because expf(x)-1 will result in overflow for 25% of single-precision floating point numbers. --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 42f4a34b8..910981e4a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2312,7 +2312,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } From b42978e7e4d56eaaa93588414e804d9fbbc3cae2 Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Mon, 5 Aug 2024 13:45:01 +0100 Subject: [PATCH 075/154] readme : add ramalama to the availables UI (#8811) ramalama is a repo agnostic boring CLI tool that supports pulling from ollama, huggingface and oci registries. 
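[Editor's note on the ELU fix in patch 074 above: the gain from expm1f(x) over expf(x) - 1.0f is easy to verify in isolation. The following is a minimal standalone C sketch, not part of any patch in this series; it uses only the standard <math.h> functions, and the input value 1e-10f is chosen purely for illustration. One easy-to-demonstrate failure mode of the old expression is loss of significance near zero: expf(x) rounds to exactly 1.0f, so the subtraction cancels to 0, while expm1f(x) keeps full precision.

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float x = 1e-10f;
        // expf(1e-10f) rounds to 1.0f in single precision, so the
        // subtraction cancels to exactly 0 instead of ~1e-10.
        printf("expf(x) - 1 = %g\n", (double)(expf(x) - 1.0f)); // prints 0
        printf("expm1f(x)   = %g\n", (double)expm1f(x));        // prints ~1e-10
        return 0;
    }
]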
Signed-off-by: Eric Curtin --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 34992b0af..1283f6805 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) +- [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) From bc0f887e159c0d78c28121e2c8b5c58094170875 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Mon, 5 Aug 2024 21:10:37 +0800 Subject: [PATCH 076/154] cann: fix buffer_num and runtime speed slowly error (#8865) --- ggml/src/ggml-cann.cpp | 4 --- .../kernels/quantize_float_to_q4_0.cpp | 25 +++++++++++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index a15bc8aa2..81783b7b1 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1670,10 +1670,6 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, // TODO: fix me // Current groupsize should not be greater than k-1 in // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(). - if (op->src[0]->ne[0]-1 > QK8_0) { - return true; - } - return false; case GGML_TYPE_Q4_0: return true; default: diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp index f6deee3c5..9c8c86b66 100644 --- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp @@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 { __aicore__ inline void init(GM_ADDR input, GM_ADDR output, int64_t *input_ne_ub, size_t *input_nb_ub, int64_t *output_ne_ub) { + // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], + // permute=[0,0,0,0]): + // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); @@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 { pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); pipe.InitBuffer(output_queue, BUFFER_NUM, Group_Size * sizeof(int8_t) / 2); - pipe.InitBuffer(cast_queue , BUFFER_NUM, Group_Size * sizeof(float)); - pipe.InitBuffer(work_queue, BUFFER_NUM, Group_Size*sizeof(float)); - pipe.InitBuffer(max_queue, BUFFER_NUM, Group_Size*sizeof(float)); - pipe.InitBuffer(min_queue, BUFFER_NUM, Group_Size*sizeof(float)); - pipe.InitBuffer(scale_queue, BUFFER_NUM, 16*sizeof(half)); - pipe.InitBuffer(int8_queue, BUFFER_NUM, Group_Size * sizeof(int8_t)); - pipe.InitBuffer(half_queue, BUFFER_NUM, Group_Size * sizeof(half)); + pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); + pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); + pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); + pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); } __aicore__ inline void copy_in(uint32_t offset) { @@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 { for (int64_t j = 0; j < group_size_in_row; j++) { half scale = 
calculate_group(i, j); scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { + // Copy Group_Size/2 length data each time. + if (scale_local_offset == Group_Size / 2) { scale_local_offset = 0; // TODO: OPTIMIZE ME pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); + DataCopy(scale_gm[scale_global_offset], scale_local, + Group_Size / 2); pipe_barrier(PIPE_ALL); - scale_global_offset += 16; + scale_global_offset += Group_Size / 2; } } } From 0a4ce786814b123096d18aadca89cd352b9e590b Mon Sep 17 00:00:00 2001 From: Liu Jia <109258120+Septa2112@users.noreply.github.com> Date: Tue, 6 Aug 2024 00:14:10 +0800 Subject: [PATCH 077/154] common : Changed tuple to struct (TODO fix) (#8823) * common : Changed tuple to struct (TODO fix) Use struct `llama_init_result` to replace the previous std::tuple * delete llama_init_default_params() * delete the extra whitespace --- common/common.cpp | 18 ++++++++++-------- common/common.h | 8 ++++++-- .../cvector-generator/cvector-generator.cpp | 7 ++++--- examples/embedding/embedding.cpp | 8 ++++---- examples/eval-callback/eval-callback.cpp | 7 ++++--- examples/imatrix/imatrix.cpp | 6 +++--- examples/infill/infill.cpp | 5 ++++- examples/lookahead/lookahead.cpp | 8 ++++---- examples/lookup/lookup-create.cpp | 8 ++++---- examples/lookup/lookup-stats.cpp | 8 ++++---- examples/lookup/lookup.cpp | 8 ++++---- examples/main/main.cpp | 5 ++++- examples/parallel/parallel.cpp | 8 ++++---- examples/perplexity/perplexity.cpp | 8 ++++---- examples/retrieval/retrieval.cpp | 9 +++++---- examples/save-load-state/save-load-state.cpp | 7 ++++--- examples/server/server.cpp | 5 ++++- examples/speculative/speculative.cpp | 8 ++++++-- 18 files changed, 82 insertions(+), 59 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 521f849e2..ee7fbcba3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2039,8 +2039,8 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // - -std::tuple llama_init_from_gpt_params(gpt_params & params) { +struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; @@ -2055,7 +2055,7 @@ std::tuple llama_init_from_gpt_par if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); + return iparams; } auto cparams = llama_context_params_from_gpt_params(params); @@ -2064,7 +2064,7 @@ std::tuple llama_init_from_gpt_par if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } if (!params.control_vectors.empty()) { @@ -2075,7 +2075,7 @@ std::tuple llama_init_from_gpt_par if (cvec.n_embd == -1) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } int err = llama_control_vector_apply(lctx, @@ -2087,7 +2087,7 @@ std::tuple llama_init_from_gpt_par if (err) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } } @@ -2099,7 +2099,7 @@ std::tuple llama_init_from_gpt_par fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } 
llama_lora_adapter_set(lctx, adapter, lora_scale); } @@ -2135,7 +2135,9 @@ std::tuple llama_init_from_gpt_par llama_reset_timings(lctx); } - return std::make_tuple(model, lctx); + iparams.model = model; + iparams.context = lctx; + return iparams; } struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { diff --git a/common/common.h b/common/common.h index 8240ff99b..51dcc0d39 100644 --- a/common/common.h +++ b/common/common.h @@ -308,8 +308,12 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -// TODO: avoid tuplue, use struct -std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_init_result { + struct llama_model * model = nullptr; + struct llama_context * context = nullptr; +}; + +struct llama_init_result llama_init_from_gpt_params(gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d4e126ac2..a12e90d82 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -414,9 +414,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // int n_ctx = llama_n_ctx(ctx); int n_layers = llama_n_layer(model); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 1466e5b2b..cd7b448a6 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -79,11 +79,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 37d30ab8c..ef35ba2c0 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -163,9 +163,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 6ce1863cf..58814b96e 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -611,10 +611,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_model * model; - llama_context * ctx; + llama_init_result llama_init = llama_init_from_gpt_params(params); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + 
llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index dc93d2301..92d630b15 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -179,7 +179,10 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index fb20ad93f..81cf1629c 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -58,11 +58,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the target model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // Tokenize the prompt std::vector inp; diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index d713f6f21..5f04709f5 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -22,11 +22,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; GGML_ASSERT(model != nullptr); // tokenize the prompt diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 2fe67100e..400f3e0b0 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -26,11 +26,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index bb571bac4..d53a9828c 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -34,11 +34,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 61e960ea2..6e0635a66 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -207,7 +207,10 @@ int main(int argc, char ** argv) 
{ // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7faeaec97..621a1c959 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -129,11 +129,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the target model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // load the prompts from an external file if there are any if (params.prompt.empty()) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index dbe445391..372684f09 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2018,11 +2018,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index eb89d16da..65b19ce71 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -148,11 +148,12 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index d8afdc141..3ea7c790d 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -28,10 +28,11 @@ int main(int argc, char ** argv) { std::string result2; // init - llama_model * model; - llama_context * ctx; + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; - std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d5f131d9b..d178ca0f7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -677,7 +677,10 @@ struct server_context { // dedicate one sequence to the system prompt params.n_parallel += 1; 
- std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 0939a1a6a..b051a18f1 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -66,7 +66,9 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + llama_init_result llama_init_tgt = llama_init_from_gpt_params(params); + model_tgt = llama_init_tgt.model; + ctx_tgt = llama_init_tgt.context; // load the draft model params.model = params.model_draft; @@ -75,7 +77,9 @@ int main(int argc, char ** argv) { params.n_threads = params.n_threads_draft; } params.n_threads_batch = params.n_threads_batch_draft; - std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + llama_init_result llama_init_dft = llama_init_from_gpt_params(params); + model_dft = llama_init_dft.model; + ctx_dft = llama_init_dft.context; const bool vocab_type_tgt = llama_vocab_type(model_tgt); LOG("vocab_type tgt: %d\n", vocab_type_tgt); From d4ff847153e9cf7220d1b39aa21172069e6e8cea Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Tue, 6 Aug 2024 09:09:12 +0800 Subject: [PATCH 078/154] [SYCL] correct cmd name (#8877) --- examples/sycl/win-run-llama2.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index f0385cdf0..c2918d6dc 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 +.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 From c21a896405de4cdf4207eb8130555ceaac0ab110 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 6 Aug 2024 12:42:42 +0800 Subject: [PATCH 079/154] [CANN]: Fix ggml_backend_cann_buffer_get_tensor (#8871) * cann: fix ggml_backend_cann_buffer_get_tensor 1. fix data ptr offset 2. enable the acquisition of incomplete tensors * fix backend cann set_tensor --- ggml/src/ggml-cann.cpp | 81 ++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 81783b7b1..06930ba2e 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -896,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor( * @param size Size of the data to be copied, in bytes. */ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( - ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, + ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { - // GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; + ggml_backend_cann_buffer_context *ctx = + (ggml_backend_cann_buffer_context *)buffer->context; ggml_cann_set_device(ctx->device); // TODO: refer to cann(#6017), it use thread's default stream. 
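    // NOTE: `offset` is applied to the device-side tensor->data pointer below, not to the host `data` pointer, so a sub-range of a tensor can be written.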
@@ -908,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( // Why aclrtSynchronizeDevice? if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset, - size, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, + ACL_MEMCPY_HOST_TO_DEVICE)); } else { - void* transform_buffer = malloc(size); - ggml_backend_cann_transform(tensor, (const char*)data + offset, - transform_buffer); + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); #ifndef NDEBUG - void* check_buffer = malloc(size); + void *check_buffer = malloc(size); ggml_backend_cann_transform_back(tensor, transform_buffer, check_buffer); - GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) == - 0); + GGML_ASSERT(memcmp(data, check_buffer, size) == 0); free(check_buffer); #endif - ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, + transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE)); free(transform_buffer); } @@ -945,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( GGML_CALL static void ggml_backend_cann_buffer_get_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { - GGML_ASSERT(size == ggml_nbytes(tensor)); ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size, + ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); } else { void* transform_buffer = malloc(size); - ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size, + ACL_CHECK(aclrtMemcpy(transform_buffer, size, + (char*)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); - ggml_backend_cann_transform_back(tensor, transform_buffer, - (char*)data + offset); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); free(transform_buffer); } } @@ -1448,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { * @param size Size of the data to copy in bytes. 
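 * @note `offset` is applied to the device-side tensor->data pointer; types that need transformation are staged through a temporary host buffer before the copy is issued on the backend stream.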
*/ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, - ggml_tensor* tensor, - const void* data, + ggml_tensor *tensor, + const void *data, size_t offset, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpyAsync( - tensor->data, size, (const char*)data + offset, size, - ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); + ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data, + size, ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); } else { - void* transform_buffer = malloc(size); - ggml_backend_cann_transform(tensor, (const char*)data + offset, - transform_buffer); + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); #ifndef NDEBUG - void* check_buffer = malloc(size); + void *check_buffer = malloc(size); ggml_backend_cann_transform_back(tensor, transform_buffer, check_buffer); - GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size)); + GGML_ASSERT(memcmp(data, check_buffer, size)); free(check_buffer); #endif - ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size, - ACL_MEMCPY_HOST_TO_DEVICE, - cann_ctx->stream())); + ACL_CHECK(aclrtMemcpyAsync( + (char *)tensor->data + offset, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); free(transform_buffer); } } GGML_CALL static void ggml_backend_cann_get_tensor_async( - ggml_backend_t backend, const ggml_tensor* tensor, void* data, + ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; @@ -1491,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async( "unsupported buffer type"); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data, + ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream())); } else { - void* transform_buffer = malloc(size); - ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size, - ACL_MEMCPY_DEVICE_TO_HOST, - cann_ctx->stream())); + void *transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync( + transform_buffer, size, (char *)tensor->data + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream())); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); - ggml_backend_cann_transform_back(tensor, transform_buffer, - (char*)data + offset); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); free(transform_buffer); } } From cdd1889de62a7140c8c016405ec917464bde8bd3 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Tue, 6 Aug 2024 02:20:54 -0500 Subject: [PATCH 080/154] convert : add support for XLMRoberta embedding models (#8658) * add conversion for bge-m3; small fix in unigram tokenizer * clean up and simplify XLMRoberta conversion --- convert_hf_to_gguf.py | 106 ++++++++++++++++++++++++++++++++++++++++++ src/llama-vocab.cpp | 5 +- 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8b33c30d9..38b92bc81 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2506,6 +2506,112 @@ class NomicBertModel(BertModel): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) +@Model.register("XLMRobertaModel") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = 
[SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + # realign tokens (see HF tokenizer code) + tokens = [b'', b'', b'', b''] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("GemmaForCausalLM") class GemmaModel(Model): model_arch = gguf.MODEL_ARCH.GEMMA diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9be076f6d..e6d6059d0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -816,6 +816,9 @@ struct llm_tokenizer_ugm { * the best tokenization. 
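     * Note: tokens are accumulated in reverse order, and only the portion of `output` appended by this call is reversed at the end, so tokens already present in `output` are preserved.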
*/ void tokenize(const std::string & text, std::vector & output) { + // get current size of output (for reversal later) + size_t output_size = output.size(); + // normalize the input first std::string normalized; normalize(text, &normalized); @@ -895,7 +898,7 @@ struct llm_tokenizer_ugm { } // reverse the output since we added tokens starting from the end of the input - std::reverse(output.begin(), output.end()); + std::reverse(output.begin() + output_size, output.end()); } private: From 2d5dd7bb3fa382806cd3e0bfc7a1d92349bc0ccf Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 6 Aug 2024 15:26:46 +0800 Subject: [PATCH 081/154] ggml : add epsilon as a parameter for group_norm (#8818) Signed-off-by: Molly Sophia --- ggml/include/ggml.h | 7 ++++--- ggml/src/ggml-cann/aclnn_ops.cpp | 4 +++- ggml/src/ggml-cuda/norm.cu | 9 ++++++--- ggml/src/ggml-metal.m | 6 ++---- ggml/src/ggml-sycl/norm.cpp | 9 ++++++--- ggml/src/ggml.c | 19 ++++++++++++------- tests/test-backend-ops.cpp | 8 +++++--- 7 files changed, 38 insertions(+), 24 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a9e88e592..15602a96d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1140,16 +1140,17 @@ extern "C" { // group normalize along ne0*ne1*n_groups // used in stable-diffusion - // TODO: eps is hardcoded to 1e-6 for now GGML_API struct ggml_tensor * ggml_group_norm( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups); + int n_groups, + float eps); GGML_API struct ggml_tensor * ggml_group_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups); + int n_groups, + float eps); // a - x // b - dy diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 171439132..8c4132f5b 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = ggml_cann_create_tensor(src); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - const float eps = 1e-6f; // TODO: make this a parameter int n_groups = dst->op_params[0]; + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 30866d512..133e219f0 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i } } -static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) { - static const float eps = 1e-6f; +static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) { if (group_size < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); group_norm_f32<<>>(x, dst, group_size, ne_elements, eps); @@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) GGML_ASSERT( dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream); + group_norm_f32_cuda(src0_d, dst_d, num_groups * 
src0->ne[3], eps, group_size, ggml_nelements(src0), stream); } void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 48b813131..b512eb0be 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -2229,10 +2229,8 @@ static enum ggml_status ggml_metal_graph_compute( GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ggml_is_contiguous(src0)); - //float eps; - //memcpy(&eps, dst->op_params, sizeof(float)); - - const float eps = 1e-6f; // TODO: temporarily hardcoded + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); const int32_t n_groups = ((int32_t *) dst->op_params)[0]; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index cccf87d06..b3159b9d1 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, } static void group_norm_f32_sycl(const float* x, float* dst, - const int num_groups, const int group_size, + const int num_groups, const float eps, const int group_size, const int ne_elements, queue_ptr stream, int device) { - static const float eps = 1e-6f; if (group_size < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { @@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* GGML_ASSERT(dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); + group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); (void)src1; (void)dst; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 910981e4a..daceec414 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5374,6 +5374,7 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_groups, + float eps, bool inplace) { bool is_node = false; @@ -5384,7 +5385,8 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op_params[0] = n_groups; + ggml_set_op_params_i32(result, 0, n_groups); + ggml_set_op_params_f32(result, 1, eps); result->op = GGML_OP_GROUP_NORM; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -5396,15 +5398,17 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_tensor * ggml_group_norm( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, false); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct ggml_tensor * ggml_group_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, true); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } // ggml_mul_mat @@ -12095,10 +12099,11 @@ static void ggml_compute_forward_group_norm_f32( GGML_TENSOR_UNARY_OP_LOCALS - const float eps = 1e-6f; // TODO: make this a parameter - // TODO: optimize + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int n_channels = src0->ne[2]; int n_groups = dst->op_params[0]; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 54cef05c3..2f4117a62 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1511,6 +1511,7 @@ struct test_group_norm : public test_case { const ggml_type type; const std::array ne; const int32_t num_groups; + const float eps; std::string vars() override { return VARS_TO_STR3(type, ne, num_groups); @@ -1518,12 +1519,13 @@ struct test_group_norm : public test_case { test_group_norm(ggml_type type = GGML_TYPE_F32, std::array ne = {64, 64, 320, 1}, - int32_t num_groups = 32) - : type(type), ne(ne), num_groups(num_groups) {} + int32_t num_groups = 32, + float eps = 1e-6f) + : type(type), ne(ne), num_groups(num_groups), eps(eps) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); - ggml_tensor * out = ggml_group_norm(ctx, a, num_groups); + ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps); return out; } }; From 0bf16de07b0692e7df26b9a633e232bbd66e0360 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 6 Aug 2024 11:48:01 +0300 Subject: [PATCH 082/154] contributing : add note about write access --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b688f78ec..a9e000e52 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,6 +5,7 @@ - Execute [the full CI locally on your machine](ci/README.md) before publishing - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience +- Consider allowing write access to your branch for faster review - If your PR becomes stale, don't hesitate to ping the maintainers in the comments # Pull requests (for collaborators) From efda90c93a62274ec0b0bfa80c4eee4bdb6966d0 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 6 Aug 2024 16:32:03 +0500 Subject: [PATCH 083/154] [Vulkan] Fix compilation of `vulkan-shaders-gen` on w64devkit after `e31a4f6` (#8880) * Fix compilation issue in `vulkan-shaders-gen` https://github.com/ggerganov/llama.cpp/commit/e31a4f679779220312c165b0f5994c680a610e38 broke compilation on w64devkit. Including `algorithm` seems to fix that. 
* Guard it under `#ifdef _WIN32` --- ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index f6f4f116a..a792e203b 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -22,6 +22,7 @@ #ifdef _WIN32 #include #include // For _mkdir on Windows + #include // For std::replace on w64devkit #else #include #include From db20f50cf4710c46e3a996919a26858cae8c80ed Mon Sep 17 00:00:00 2001 From: Jaeden Amero Date: Tue, 6 Aug 2024 17:21:47 +0400 Subject: [PATCH 084/154] cmake : Link vulkan-shaders-gen with pthreads (#8835) When using CMake to build with Vulkan support, compiling vulkan-shaders-gen fails due to missing a CMakeLists.txt specification to link vulkan-shaders-gen with the threading library, resulting in the following error. [5/172] Linking CXX executable bin/vulkan-shaders-gen FAILED: bin/vulkan-shaders-gen : && /usr/bin/c++ ggml/src/vulkan-shaders/CMakeFiles/vulkan-shaders-gen.dir/vulkan-shaders-gen.cpp.o -o bin/vulkan-shaders-gen && : ld: error: undefined symbol: pthread_create >>> referenced by vulkan-shaders-gen.cpp >>> ggml/src/vulkan-shaders/CMakeFiles/vulkan-shaders-gen.dir/vulkan-shaders-gen.cpp.o:(std::__1::__libcpp_thread_create[abi:se180100](pthread**, >>> void* (*)(void*), void*)) c++: error: linker command failed with exit code 1 (use -v to see invocation) [6/172] Generating build details from Git -- Found Git: /usr/local/bin/git (found version "2.45.2") ninja: build stopped: subcommand failed. Add the CMakeLists.txt specification to link vulkan-shaders-gen with the threading library and fix the above error. Fixes #8834 --- ggml/src/vulkan-shaders/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/vulkan-shaders/CMakeLists.txt index 41551e009..10075db33 100644 --- a/ggml/src/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/vulkan-shaders/CMakeLists.txt @@ -1,5 +1,7 @@ +find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) From 5f4dcb1e60bbfe936b45778dad177a5b9a09b066 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 6 Aug 2024 16:44:35 +0200 Subject: [PATCH 085/154] simple : update name of executable to llama-simple (#8885) This commit updates the name of the executable in README.md from `simple` to `llama-simple`. --- examples/simple/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple/README.md b/examples/simple/README.md index 49e24501c..0ff342535 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -3,7 +3,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. ```bash -./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" +./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" ... 
From 641f5dd2a6422941faa9f32ab5cb50fc1b24c1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 6 Aug 2024 17:13:55 +0200 Subject: [PATCH 086/154] CUDA: fix padding logic for FP16/FP32 (#8884) --- ggml/src/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 68605fff6..654f93e83 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1501,7 +1501,7 @@ static void ggml_cuda_op_mul_mat( } // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared: - if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream)); From 1e6f6554aa11fa10160a5fda689e736c3c34169f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 Aug 2024 17:33:39 +0200 Subject: [PATCH 087/154] server : add lora hotswap endpoint (WIP) (#8857) * server : add lora hotswap endpoint * handle lora_no_apply * fix build * update docs * clean up struct def * fix build * add LoRA test * fix style --- common/common.cpp | 60 +++++--- common/common.h | 19 ++- examples/export-lora/export-lora.cpp | 10 +- examples/server/README.md | 128 +++++++++--------- examples/server/server.cpp | 66 ++++++++- examples/server/tests/features/lora.feature | 36 +++++ examples/server/tests/features/steps/steps.py | 21 +++ examples/server/tests/requirements.txt | 1 + gguf-py/gguf/metadata.py | 2 +- 9 files changed, 251 insertions(+), 92 deletions(-) create mode 100644 examples/server/tests/features/lora.feature diff --git a/common/common.cpp b/common/common.cpp index ee7fbcba3..2e8374d50 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); return true; } if (arg == "--lora-scaled") { CHECK_ARG - const char* lora_adapter = argv[i]; + std::string lora_adapter = argv[i]; CHECK_ARG - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]), + }); + return true; + } + if (arg == "--lora-init-without-apply") { + params.lora_init_without_apply = true; return true; } if (arg == "--control-vector") { @@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)",
params.lora_init_without_apply ? "enabled" : "disabled"}); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -2091,17 +2102,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } } - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + // load and optionally apply lora adapters + for (auto & la : params.lora_adapters) { + llama_lora_adapter_container loaded_la; + loaded_la.path = la.path; + loaded_la.scale = la.scale; + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); return iparams; } - llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters + } + if (!params.lora_init_without_apply) { + llama_lora_adapters_apply(lctx, iparams.lora_adapters); } if (params.ignore_eos) { @@ -2140,6 +2156,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { return iparams; } +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { + llama_lora_adapter_clear(ctx); + for (auto & la : lora_adapters) { + if (la.scale != 0.0f) { + llama_lora_adapter_set(ctx, la.adapter, la.scale); + } + } +} + struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { auto mparams = llama_model_default_params(); @@ -3162,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale == 1.0f) { + fprintf(stream, " - %s\n", la.path.c_str()); } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); } fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale != 1.0f) { + fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } + fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index 51dcc0d39..d88966ece 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct llama_lora_adapter_info { + std::string path; + float scale; +}; + +struct llama_lora_adapter_container : llama_lora_adapter_info { + struct llama_lora_adapter * adapter; +}; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -126,8 +135,8 @@ struct gpt_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; - // TODO: avoid tuple, use struct - std::vector> lora_adapter; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -309,8 +318,9 @@ std::string fs_get_cache_file(const std::string & filename); // struct llama_init_result { - struct llama_model * model = nullptr; + struct llama_model * model = nullptr; struct llama_context * context = nullptr; + std::vector lora_adapters; }; struct llama_init_result llama_init_from_gpt_params(gpt_params & params); @@ -321,6 +331,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +// clear LoRA adapters from context, then apply new list of adapters +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 150f7e8d5..d228ae66e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -135,7 +135,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector> & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors @@ -144,9 +144,9 @@ struct lora_merge_ctx { throw std::runtime_error("split model is not yet supported"); } - for (auto lora_inp : lora_files) { - auto fname = std::get<0>(lora_inp); - auto scale = std::get<1>(lora_inp); + for (auto & lora_inp : lora_files) { + auto fname = lora_inp.path; + auto scale = lora_inp.scale; std::unique_ptr adapter(new file_input(fname, scale)); check_metadata_lora(adapter.get()); adapters.push_back(std::move(adapter)); @@ -407,7 +407,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity == 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/server/README.md b/examples/server/README.md index de83ee7d0..e17595fe8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -207,41 +207,6 @@ model: -hff, --hf-file FILE Hugging Face model file (default: unused) -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) -retrieval: - - --context-file FNAME file to load context from (repeat to specify multiple files) - --chunk-size N minimum length of embedded text chunks (default: 64) - --chunk-separator STRING - separator between chunks (default: ' - ') - -passkey: - - --junk N number of times to repeat the junk text (default: 250) - --pos N position of the 
passkey in the junk text (default: -1) - -imatrix: - - -o, --output FNAME output file (default: 'imatrix.dat') - --output-frequency N output the imatrix every N iterations (default: 10) - --save-frequency N save an imatrix copy every N iterations (default: 0) - --process-output collect data for the output tensor (default: false) - --no-ppl do not compute perplexity (default: true) - --chunk N start processing the input from chunk N (default: 0) - -bench: - - -pps is the prompt shared across parallel sequences (default: false) - -npp n0,n1,... number of prompt tokens - -ntg n0,n1,... number of text generation tokens - -npl n0,n1,... number of parallel prompts - -embedding: - - --embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) - --embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix - --embd-separator separator of embendings (default \n) for example "<#sep#>" - server: --host HOST ip address to listen (default: 127.0.0.1) @@ -267,7 +232,8 @@ server: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -sps, --slot-prompt-similarity SIMILARITY how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) - + --lora-init-without-apply + load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) logging: @@ -279,15 +245,6 @@ logging: --log-file FNAME Specify a log filename (without extension) --log-new Create a separate new log file on start. Each log file will have unique name: "..log" --log-append Don't truncate the old log file. - -cvector: - - -o, --output FNAME output file (default: 'control_vector.gguf') - --positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt') - --negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt') - --pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100) - --pca-iter N number of iterations used for PCA (default: 1000) - --method {pca,mean} dimensionality reduction method to be used (default: pca) ``` @@ -411,7 +368,8 @@ node index.js ## API Endpoints -- **GET** `/health`: Returns the current state of the server: +### GET `/health`: Returns the current state of the server + - 503 -> `{"status": "loading model"}` if the model is still being loaded. - 500 -> `{"status": "error"}` if the model failed to load. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. @@ -420,7 +378,7 @@ node index.js If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. -- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. +### POST `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* @@ -498,7 +456,7 @@ node index.js `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. 
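A minimal example request (the host, port and prompt here are placeholders; `n_predict` caps the number of tokens to generate):

```shell
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```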
-### Result JSON +**Response format** - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. @@ -537,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`. - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) -- **POST** `/tokenize`: Tokenize a given text. +### POST `/tokenize`: Tokenize a given text *Options:* @@ -545,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`. `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` -- **POST** `/detokenize`: Convert tokens to text. +### POST `/detokenize`: Convert tokens to text *Options:* `tokens`: Set the tokens to detokenize. -- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. +### POST `/embedding`: Generate embedding of a given text + +The same as [the embedding example](../embedding) does. *Options:* @@ -559,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. +### POST `/infill`: For code infilling. + +Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* @@ -571,7 +533,7 @@ Notice that each `probs` is an array of length `n_probs`. - **GET** `/props`: Return current server settings. -### Result JSON +**Response format** ```json { @@ -589,7 +551,9 @@ Notice that each `probs` is an array of length `n_probs`. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API + +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. 
Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* @@ -641,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. +### POST `/v1/embeddings`: OpenAI-compatible embeddings API *Options:* @@ -675,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. +### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. -### Result JSON +**Response format** ```json [ @@ -738,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` -- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: +### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled: Available metrics: - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. @@ -750,13 +714,13 @@ Available metrics: - `llamacpp:requests_processing`: Number of requests processing. - `llamacpp:requests_deferred`: Number of requests deferred. -- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. +### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. *Options:* `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -770,13 +734,13 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. +### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. *Options:* `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -790,9 +754,9 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. +### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. -### Result JSON +**Response format** ```json { @@ -801,6 +765,42 @@ Available metrics: } ``` +### GET `/lora-adapters`: Get list of all LoRA adapters + +If an adapter is disabled, the scale will be set to 0. + +**Response format** + +```json +[ + { + "id": 0, + "path": "my_adapter_1.gguf", + "scale": 0.0 + }, + { + "id": 1, + "path": "my_adapter_2.gguf", + "scale": 0.0 + } +] +``` + +### POST `/lora-adapters`: Set list of LoRA adapters + +To disable an adapter, either remove it from the list below, or set scale to 0. 
+ +**Request format** + +To know the `id` of the adapter, use GET `/lora-adapters` + +```json +[ + {"id": 0, "scale": 0.2}, + {"id": 1, "scale": 0.8} +] +``` + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d178ca0f7..898c83ea3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -78,6 +78,7 @@ enum server_task_type { SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, }; struct server_task { @@ -622,6 +623,7 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + std::vector lora_adapters; gpt_params params; @@ -681,6 +683,7 @@ struct server_context { model = llama_init.model; ctx = llama_init.context; + lora_adapters = llama_init.lora_adapters; params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -1850,6 +1853,14 @@ struct server_context { }; queue_results.send(result); } break; + case SERVER_TASK_TYPE_SET_LORA: + { + llama_lora_adapters_apply(ctx, lora_adapters); + server_task_result result; + result.id = task.id; + result.data = json{{ "success", true }}; + queue_results.send(result); + } break; } } @@ -3328,6 +3339,55 @@ int main(int argc, char ** argv) { return res.set_content(root.dump(), "application/json; charset=utf-8"); }; + const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + json result = json::array(); + for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { + auto & la = ctx_server.lora_adapters[i]; + result.push_back({ + {"id", i}, + {"path", la.path}, + {"scale", la.scale}, + }); + } + res.set_content(result.dump(), "application/json"); + res.status = 200; // HTTP OK + }; + + const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + + const std::vector body = json::parse(req.body); + int max_idx = ctx_server.lora_adapters.size(); + + // clear existing value + for (auto & la : ctx_server.lora_adapters) { + la.scale = 0.0f; + } + + // set value + for (auto entry : body) { + int id = entry.at("id"); + float scale = entry.at("scale"); + if (0 <= id && id < max_idx) { + ctx_server.lora_adapters[id].scale = scale; + } else { + throw std::runtime_error("invalid adapter id"); + } + } + + server_task task; + task.type = SERVER_TASK_TYPE_SET_LORA; + const int id_task = ctx_server.queue_tasks.post(task); + ctx_server.queue_results.add_waiting_task_id(id_task); + + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); + + res.set_content(result.data.dump(), "application/json"); + res.status = 200; // HTTP OK + }; + auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(content), len, mime_type); @@ -3366,7 +3426,6 @@ int main(int argc, char ** argv) { // register API routes svr->Get ("/health", handle_health); - svr->Get ("/slots", handle_slots); svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); svr->Get ("/v1/models", handle_models); @@ -3381,6 +3440,11 @@ int 
main(int argc, char ** argv) { svr->Post("/v1/embeddings", handle_embeddings); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); + // LoRA adapters hotswap + svr->Get ("/lora-adapters", handle_lora_adapters_list); + svr->Post("/lora-adapters", handle_lora_adapters_apply); + // Save & load slots + svr->Get ("/slots", handle_slots); if (!params.slot_save_path.empty()) { // only enable slot endpoints if slot_save_path is set svr->Post("/slots/:id_slot", handle_slots_action); diff --git a/examples/server/tests/features/lora.feature b/examples/server/tests/features/lora.feature new file mode 100644 index 000000000..7b85988ac --- /dev/null +++ b/examples/server/tests/features/lora.feature @@ -0,0 +1,36 @@ +@llama.cpp +@lora +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf + And a model file stories15M_MOE-F16.gguf + And a model alias stories15M_MOE + And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf + And 42 as server seed + And 1024 as batch size + And 1024 as ubatch size + And 2048 KV cache size + And 64 max tokens to predict + And 0.0 temperature + Then the server is starting + Then the server is healthy + + Scenario: Completion LoRA disabled + Given switch off lora adapter 0 + Given a prompt: + """ + Look in thy glass + """ + And a completion request with no api error + Then 64 tokens are predicted matching little|girl|three|years|old + + Scenario: Completion LoRA enabled + Given switch on lora adapter 0 + Given a prompt: + """ + Look in thy glass + """ + And a completion request with no api error + Then 64 tokens are predicted matching eye|love|glass|sun diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index df0814cc9..6705a34fc 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -7,6 +7,7 @@ import subprocess import sys import threading import time +import requests from collections.abc import Sequence from contextlib import closing from re import RegexFlag @@ -70,6 +71,7 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.user_api_key = None context.response_format = None context.temperature = None + context.lora_file = None context.tasks_result = [] context.concurrent_tasks = [] @@ -82,6 +84,12 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str): context.model_hf_file = hf_file context.model_file = os.path.basename(hf_file) +@step('a lora adapter file from {lora_file_url}') +def step_download_lora_file(context, lora_file_url: str): + file_name = lora_file_url.split('/').pop() + context.lora_file = f'../../../{file_name}' + with open(context.lora_file, 'wb') as f: + f.write(requests.get(lora_file_url).content) @step('a model file {model_file}') def step_model_file(context, model_file: str): @@ -849,6 +857,17 @@ async def step_erase_slot(context, slot_id): context.response = response +@step('switch {on_or_off} lora adapter {lora_id:d}') +@async_run_until_complete +async def toggle_lora_adapter(context, on_or_off: str, lora_id: int): + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/lora-adapters', + json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}], + headers={"Content-Type": "application/json"}) as response: + 
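+                # keep the raw response so later steps can assert on its status code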
context.response = response + print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}]) + + @step('the server responds with status code {status_code:d}') def step_server_responds_with_status_code(context, status_code): assert context.response.status == status_code @@ -1326,6 +1345,8 @@ def start_server_background(context): server_args.extend(['--grp-attn-w', context.n_ga_w]) if context.debug: server_args.append('--verbose') + if context.lora_file: + server_args.extend(['--lora', context.lora_file]) if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2c741ea10..f2d7e5c57 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -4,3 +4,4 @@ huggingface_hub~=0.20.3 numpy~=1.26.4 openai~=1.30.3 prometheus-client~=0.20.0 +requests~=2.32.3 diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index ea4d02705..db318542a 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -174,7 +174,7 @@ class Metadata: org_component, model_full_name_component = None, model_id # Check if we erroneously matched against './' or '../' etc... - if org_component is not None and org_component[0] == '.': + if org_component is not None and len(org_component) > 0 and org_component[0] == '.': org_component = None name_parts: list[str] = model_full_name_component.split('-') From 31958546c3e4695a8a24bb7ba3b79a8f76d05afe Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:41:54 +0200 Subject: [PATCH 088/154] typo correction (#8891) --- include/llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index f23355a6b..66c266298 100644 --- a/include/llama.h +++ b/include/llama.h @@ -345,7 +345,7 @@ extern "C" { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // itoken embeddings tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored From 725e3d94379d5b619c027347308bccf2e0ead89f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 7 Aug 2024 01:43:00 +0200 Subject: [PATCH 089/154] quantize : update usage comment in quantize.cpp (#8889) This commit updates the usage comment in quantize.cpp to reflect the new name of the executable, which is llama-quantize. 
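As a concrete illustration of the updated usage string (visible in the diff just below), an invocation might look like the following sketch; the file paths, quantization type, and thread count are placeholders rather than values taken from this patch:

```bash
# Hypothetical run of the renamed executable: only flags that appear in the
# updated usage comment are used; paths and the target type are placeholders.
./llama-quantize --allow-requantize models/llama/ggml-model.gguf \
    models/llama/ggml-model-q4_k_m.gguf Q4_K_M 8
```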
--- examples/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d7647258..7312309ae 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -91,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { From 506122d854c5d05b4a3d45a294f14bd4c02d9868 Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:01:06 +0800 Subject: [PATCH 090/154] llama-bench : add support for getting cpu info on Windows (#8824) * Add support for getting cpu info on Windows for llama_bench * refactor --------- Co-authored-by: slaren --- examples/llama-bench/llama-bench.cpp | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 521fa8880..42918bfc7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -27,6 +27,14 @@ #include "ggml-cann.h" #endif +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -96,6 +104,27 @@ static std::string get_cpu_info() { } fclose(f); } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) != ERROR_SUCCESS) { + // fail to open registry key + return ""; + } + char cpu_brand[256]; + DWORD cpu_brand_size = sizeof(cpu_brand); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)cpu_brand, + &cpu_brand_size) == ERROR_SUCCESS) { + id.assign(cpu_brand, cpu_brand_size); + } + RegCloseKey(hKey); #endif // TODO: other platforms return id; From a8dbc6f753f296108121d50f78f67463403dd6fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 7 Aug 2024 09:07:52 +0200 Subject: [PATCH 091/154] CUDA/HIP: fix tests/test-backend-ops (#8896) --- ggml/src/ggml-cuda.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 654f93e83..a00a7af6c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2742,11 +2742,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_MUL_MAT_ID: { struct ggml_tensor * a = op->src[0]; - if (op->op == GGML_OP_MUL_MAT) { - struct ggml_tensor * b = op->src[1]; - if (a->ne[3] != b->ne[3]) { - return false; - } + struct ggml_tensor * b = op->src[1]; + if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { + return false; + } + if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) { + return false; } switch (a->type) { case GGML_TYPE_F32: @@ -2877,7 +2878,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return true; case GGML_OP_FLASH_ATTN_EXT: #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - return op->src[0]->ne[0] == 64 || 
op->src[0]->ne[0] == 128; + return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128; #else if (op->src[0]->ne[0] == 128) { return true; From 0478174d5959b66096ae6609fcb0df14cab66b51 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 7 Aug 2024 11:25:36 +0100 Subject: [PATCH 092/154] [SYCL] Updated SYCL device filtering (#8901) * Updated device filter to depend on default_selector (fixes non-intel device issues) * Small related update to example/sycl Readme --- examples/sycl/README.md | 24 +++++++++--------------- ggml/src/ggml-sycl/dpct/helper.hpp | 19 ++++++++++++++++--- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/sycl/README.md b/examples/sycl/README.md index 0e3acd35b..8819d87f5 100644 --- a/examples/sycl/README.md +++ b/examples/sycl/README.md @@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU. List all SYCL devices with ID, compute capability, max work group size, ect. -1. Build the llama.cpp for SYCL for all targets. +1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*. -2. Enable oneAPI running environment +2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)* ``` source /opt/intel/oneapi/setvars.sh @@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh Check the ID in startup log, like: ``` -found 4 SYCL devices: - Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 - Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, - max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 - Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, - max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 - Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 +found 2 SYCL devices: +| | | | |Max | |Max |Global | | +| | | | |compute|Max work|sub |mem | | +|ID| Device Type| Name|Version|units |group |group|size | Driver version| +|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------| +| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138| +| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138| ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero running time, recommended | -|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index ef4609e32..fe4a8f744 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -874,7 +874,7 @@ namespace dpct inline std::string get_preferred_gpu_platform_name() { std::string result; - std::string filter = "level-zero"; + std::string filter = ""; char* env = getenv("ONEAPI_DEVICE_SELECTOR"); if (env) { if (std::strstr(env, "level_zero")) { @@ -892,11 +892,24 @@ namespace dpct else { throw std::runtime_error("invalid device filter: " + std::string(env)); } + } else { + auto default_device = sycl::device(sycl::default_selector_v); + auto default_platform_name = default_device.get_platform().get_info(); + + if 
(std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) { + filter = "level-zero"; + } + else if (std::strstr(default_platform_name.c_str(), "CUDA")) { + filter = "cuda"; + } + else if (std::strstr(default_platform_name.c_str(), "HIP")) { + filter = "hip"; + } } - auto plaform_list = sycl::platform::get_platforms(); + auto platform_list = sycl::platform::get_platforms(); - for (const auto& platform : plaform_list) { + for (const auto& platform : platform_list) { auto devices = platform.get_devices(); auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { return d.is_gpu(); From be55695eff44784a141a863f273661a6bce63dfc Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 7 Aug 2024 13:29:02 +0200 Subject: [PATCH 093/154] ggml-backend : fix async copy from CPU (#8897) * ggml-backend : fix async copy from CPU * cuda : more reliable async copy, fix stream used when the devices are the same --- ggml/src/ggml-backend.c | 25 +++++++++++++++---------- ggml/src/ggml-cuda.cu | 30 ++++++++++++++++-------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 954ab2072..e1651cc64 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b } // an async copy would normally happen after all the queued operations on both backends are completed - // sync src, set_async dst - if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src)); - } else { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_copy(src, dst); - ggml_backend_synchronize(backend_dst); - } + // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy + ggml_backend_synchronize(backend_src); + ggml_backend_synchronize(backend_dst); + ggml_backend_tensor_copy(src, dst); } // events @@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); + // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events + // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + ggml_backend_synchronize(input_backend); + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + } + ggml_backend_tensor_copy(input, input_cpy); + } } } diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index a00a7af6c..682c30d45 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, } GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst)); - 
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; - if (!ggml_backend_buffer_is_cuda(src->buffer)) { + if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { return false; } - if (!ggml_backend_buffer_is_cuda(dst->buffer)) { + if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) { return false; } - // device -> device + // device -> device copy ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context; ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context; + ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; + ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; + + if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { +#ifndef NDEBUG + GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); +#endif + return false; + } + if (backend_src != backend_dst) { - ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; - ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; - - GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); - GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); - // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } else { #ifdef GGML_CUDA_NO_PEER_COPY return false; @@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #endif } - // record event on src stream + // record event on src stream after the copy if (!cuda_ctx_src->copy_event) { ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); @@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); } else { // src and dst are on the same backend - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; } From 15fa07a5c564d3ed7e7eb64b73272cedb27e73ec Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 7 Aug 2024 18:24:05 +0200 Subject: [PATCH 094/154] make : use C compiler to build metal embed object (#8899) * make : use C compiler to build metal embed object * use rm + rmdir to avoid -r flag in rm --- Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index f4ce4f1fb..d41ebfd42 100644 --- a/Makefile +++ b/Makefile @@ -888,15 +888,16 @@ ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-common.h @echo "Embedding Metal library" @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section 
__DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) - @$(AS) $(TEMP_ASSEMBLY) -o $@ - @rm -f ${TEMP_ASSEMBLY} + $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ + @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s + @rmdir ${TEMP_ASSEMBLY} endif endif # GGML_METAL From ebd541a5705b6f7a4ce67824d1c2d4fc790f1770 Mon Sep 17 00:00:00 2001 From: Pablo Duboue Date: Thu, 8 Aug 2024 04:44:51 -0400 Subject: [PATCH 095/154] make : clean llamafile objects (#8923) `ggml/src/llamafile/sgemm.o` was not deleted on `make clean` --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d41ebfd42..5000f5819 100644 --- a/Makefile +++ b/Makefile @@ -1206,6 +1206,7 @@ clean: rm -rvf ggml/*.dll rm -rvf ggml/*.so rm -vrf ggml/src/*.o + rm -rvf ggml/src/llamafile/*.o rm -rvf common/build-info.cpp rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o From 85fca8deb6273b956bc9184bc9d70a07e1d50d07 Mon Sep 17 00:00:00 2001 From: Conrad Kramer Date: Wed, 7 Aug 2024 02:55:49 -0400 Subject: [PATCH 096/154] metal : add abort callback (ggml/905) --- ggml/include/ggml-metal.h | 2 ++ ggml/src/ggml-metal.m | 41 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index 6c3226c37..d483cf1ac 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); +GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); + GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index b512eb0be..c19274176 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -224,6 +224,10 @@ struct ggml_metal_context { bool support_simdgroup_mm; bool should_capture_next_compute; + + // abort ggml_metal_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void * abort_callback_data; }; // MSL code @@ -878,8 +882,11 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; command_buffer_builder[cb_idx] = command_buffer; - // enqueue the command buffers in order to specify their execution order - [command_buffer enqueue]; + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } } const id 
*command_buffers = command_buffer_builder; @@ -2827,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder endEncoding]; - [command_buffer commit]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } }); // Wait for completion and check status of each command buffer @@ -2847,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute( return GGML_STATUS_FAILED; } + + id next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } if (should_capture) { @@ -3242,6 +3268,15 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } +void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = user_data; +} + bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { GGML_ASSERT(ggml_backend_is_metal(backend)); From 5b33ea1ee72fadfa4f96d86dc614631739758cce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 7 Aug 2024 09:57:00 +0300 Subject: [PATCH 097/154] metal : fix struct name (ggml/912) ggml-ci --- ggml/src/ggml-metal.m | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index c19274176..9fc08ab3a 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -210,7 +210,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; -struct ggml_metal_context { +struct ggml_backend_metal_context { int n_cb; id device; @@ -293,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) { return data; } -static struct ggml_metal_context * ggml_metal_init(int n_cb) { +static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG @@ -310,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; @@ -672,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { return ctx; } -static void ggml_metal_free(struct ggml_metal_context * ctx) { +static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) { @@ -738,7 +738,7 @@ static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs return nil; } -static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { +static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const 
struct ggml_tensor * op) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; @@ -849,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } static enum ggml_status ggml_metal_graph_compute( - struct ggml_metal_context * ctx, + struct ggml_backend_metal_context * ctx, struct ggml_cgraph * gf) { @autoreleasepool { @@ -3176,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } @@ -3188,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe } GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); } GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); } @@ -3239,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) { } ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); - + struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } @@ -3263,7 +3263,7 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } @@ -3271,7 +3271,7 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->abort_callback = abort_callback; ctx->abort_callback_data = user_data; @@ -3280,7 +3280,7 @@ void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_ca bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } @@ -3288,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int 
family) { void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->should_capture_next_compute = true; } From f93d49ab1e2d1d698af8f17ead6d635405d4b289 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 7 Aug 2024 10:00:56 +0300 Subject: [PATCH 098/154] ggml : ignore more msvc warnings (ggml/906) --- ggml/src/ggml-aarch64.c | 2 ++ ggml/src/ggml.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index d7a608997..7adaadc92 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -16,6 +16,8 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" +#elif defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED GGML_UNUSED diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index daceec414..c937b5e53 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0; // disable POSIX deprecation warnings // these functions are never going away, anyway #pragma warning(disable: 4996) + +// unreachable code because of multiple instances of code after GGML_ABORT +#pragma warning(disable: 4702) #endif #if defined(_WIN32) From e44a561ab090b11d3a6fe539b768404663e4c3d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Aug 2024 13:19:47 +0300 Subject: [PATCH 099/154] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 1b82b1047..32b198983 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -18703ad600cc68dbdb04d57434c876989a841d12 +6c71d5a071d842118fb04c03c4b15116dff09621 From 366d486c163ea883442313cff1a3b154ab93e168 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Aug 2024 14:40:12 +0300 Subject: [PATCH 100/154] scripts : fix sync filenames (#0) --- scripts/sync-ggml-am.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index aa4895c6d..43ff70927 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -175,12 +175,12 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \ - -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/\1examples\/common-ggml.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \ + -e 
's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch From afd27f01fe832ece3d07ef03b7d34a9e80c4a895 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Aug 2024 14:56:52 +0300 Subject: [PATCH 101/154] scripts : sync cann files (#0) --- scripts/sync-ggml-am.sh | 7 +++++++ scripts/sync-ggml.sh | 3 +++ 2 files changed, 10 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 43ff70927..b29892565 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -62,6 +62,7 @@ while read c; do src/ggml*.m \ src/ggml*.metal \ src/ggml*.cu \ + src/ggml-cann/* \ src/ggml-cuda/* \ src/ggml-sycl/* \ src/vulkan-shaders/* \ @@ -108,6 +109,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend.c -> ggml/src/ggml-backend.c + # src/ggml-cann/* -> ggml/src/ggml-cann/ + # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp # src/ggml-common.h -> ggml/src/ggml-common.h # src/ggml-cuda/* -> ggml/src/ggml-cuda/ # src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu @@ -126,6 +129,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # include/ggml-alloc.h -> ggml/include/ggml-alloc.h # include/ggml-backend.h -> ggml/include/ggml-backend.h # include/ggml-blas.h -> ggml/include/ggml-blas.h + # include/ggml-cann.h -> ggml/include/ggml-cann.h # include/ggml-cuda.h -> ggml/include/ggml-cuda.h # include/ggml-kompute.h -> ggml/include/ggml-kompute.h # include/ggml-metal.h -> ggml/include/ggml-metal.h @@ -152,6 +156,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \ @@ -169,6 +175,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index d6d7d0a60..30a62e088 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -10,6 +10,8 @@ cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c +cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ +cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h cp -rpv 
../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu @@ -29,6 +31,7 @@ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h +cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h From 3a14e00366399040a139c67dd5951177a8cb5695 Mon Sep 17 00:00:00 2001 From: compilade Date: Thu, 8 Aug 2024 13:33:09 -0400 Subject: [PATCH 102/154] gguf-py : simplify support for quant types (#8838) * gguf-py : use classes for quants * convert_hf : simplify internal quantization type selection * gguf-py : fix flake8 lint * gguf-py : fix BF16 numpy view type * gguf-py : remove LlamaFileTypeMap Too specific to 'llama.cpp', and would be a maintenance burden to keep up to date. * gguf-py : add generic quantize and dequantize functions The quant classes no longer need to be known, only the target or the source type, for 'quantize' and 'dequantize', respectively. --- convert_hf_to_gguf.py | 107 ++++++++--------- gguf-py/gguf/constants.py | 11 +- gguf-py/gguf/lazy.py | 2 + gguf-py/gguf/quants.py | 238 ++++++++++++++++++++++++++------------ 4 files changed, 226 insertions(+), 132 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 38b92bc81..7136db440 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -251,12 +251,7 @@ class Model: return [(self.map_tensor_name(name), data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused return False @@ -285,55 +280,47 @@ class Model: for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): data: np.ndarray # type hint n_dims = len(data.shape) - data_dtype = data.dtype - data_qtype: gguf.GGMLQuantizationType | None = None - - # when both are True, f32 should win - extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) - extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) - - # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) - - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == 
gguf.LlamaFileType.MOSTLY_BF16: - data = gguf.quantize_bf16(data) - assert data.dtype == np.uint16 - data_qtype = gguf.GGMLQuantizationType.BF16 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): - data = gguf.quantize_q8_0(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q8_0 - - else: # default to float16 for quantized tensors - if data_dtype != np.float16: - data = data.astype(np.float16) - data_qtype = gguf.GGMLQuantizationType.F16 - - if data_qtype is None: # by default, convert to float32 - if data_dtype != np.float32: - data = data.astype(np.float32) + if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + ) + ) + or not name.endswith(".weight") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order @@ -1765,7 +1752,7 @@ class DbrxModel(Model): return [(new_name, data_torch)] - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused return n_dims > 1 @@ -2786,18 +2773,22 @@ class MambaModel(Model): return [(new_name, data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del n_dims # unused - - return bid is not None and new_name in ( - self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + if bid is not None and new_name in ( + self.format_tensor_name( + n, bid, ".weight" if name.endswith(".weight") else "" + ) + for n in [ gguf.MODEL_TENSOR.SSM_CONV1D, gguf.MODEL_TENSOR.SSM_X, gguf.MODEL_TENSOR.SSM_DT, gguf.MODEL_TENSOR.SSM_A, gguf.MODEL_TENSOR.SSM_D, ] - ) + ): + return gguf.GGMLQuantizationType.F32 + + return super().tensor_force_quant(name, new_name, bid, n_dims) @Model.register("CohereForCausalLM") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 59ffd92ea..89efe0c80 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ 
-1146,6 +1146,9 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 + Q4_0_4_4 = 31 + Q4_0_4_8 = 32 + Q4_0_8_8 = 33 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -1158,7 +1161,7 @@ class LlamaFileType(IntEnum): MOSTLY_F16 = 1 # except 1d tensors MOSTLY_Q4_0 = 2 # except 1d tensors MOSTLY_Q4_1 = 3 # except 1d tensors - MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 + # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 # MOSTLY_Q4_2 = 5 # support has been removed # MOSTLY_Q4_3 = 6 # support has been removed MOSTLY_Q8_0 = 7 # except 1d tensors @@ -1187,6 +1190,9 @@ class LlamaFileType(IntEnum): MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors + MOSTLY_Q4_0_4_4 = 33 # except 1d tensors + MOSTLY_Q4_0_4_8 = 34 # except 1d tensors + MOSTLY_Q4_0_8_8 = 35 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -1260,6 +1266,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), + GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), + GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), } diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index ac98d9a92..8d4fece2d 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta): class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray + shape: tuple[int, ...] # Makes the type checker happy in quants.py + @classmethod def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]: # The initial idea was to use np.nan as the fill value, diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index f4361d751..a443dd27e 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -1,5 +1,6 @@ from __future__ import annotations -from typing import Callable, Sequence +from abc import ABC, abstractmethod +from typing import Any, Callable, Sequence from numpy.typing import DTypeLike @@ -9,32 +10,22 @@ from .lazy import LazyNumpyTensor import numpy as np -def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % block_size != 0: raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") return (*shape[:-1], shape[-1] // block_size * type_size) -def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % type_size != 0: raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})") return (*shape[:-1], shape[-1] // type_size * block_size) -# same as ggml_compute_fp32_to_bf16 in ggml-impl.h -def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: - n = n.astype(np.float32, copy=False).view(np.uint32) - # force nan to quiet - n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n) - # round to nearest even - n = (np.uint64(n) + (0x7fff 
+ ((n >> 16) & 1))) >> 16 - return n.astype(np.uint16) - - # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time -def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: +def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) osize = 1 for dim in oshape: @@ -46,27 +37,6 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. return out.reshape(oshape) -def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape) - - -__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16) - - -def quantize_bf16(n: np.ndarray): - if type(n) is LazyNumpyTensor: - return __quantize_bf16_lazy(n) - else: - return __quantize_bf16_array(n) - - -__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0] - - -def can_quantize_to_q8_0(n: np.ndarray) -> bool: - return n.shape[-1] % __q8_block_size == 0 - - # round away from zero # ref: https://stackoverflow.com/a/59143326/22827863 def np_roundf(n: np.ndarray) -> np.ndarray: @@ -76,46 +46,168 @@ def np_roundf(n: np.ndarray) -> np.ndarray: return np.sign(n) * b -def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: - return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size) +class QuantError(Exception): ... -# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c -def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray: - shape = n.shape - assert shape[-1] % __q8_block_size == 0 - - n_blocks = n.size // __q8_block_size - - blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False) - - d = abs(blocks).max(axis=1, keepdims=True) / 127 - with np.errstate(divide="ignore"): - id = np.where(d == 0, 0, 1 / d) - qs = np_roundf(blocks * id) - - # (n_blocks, 2) - d = d.astype(np.float16).view(np.uint8) - # (n_blocks, block_size) - qs = qs.astype(np.int8).view(np.uint8) - - assert d.shape[1] + qs.shape[1] == __q8_type_size - - return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape)) +_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {} -def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape)) - - -__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn( - __quantize_q8_0_array, - meta_noop=(np.uint8, __quantize_q8_0_shape_change), -) - - -def quantize_q8_0(data: np.ndarray): - if type(data) is LazyNumpyTensor: - return __quantize_q8_0_lazy(data) +def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.astype(np.float32, copy=False) + elif qtype == GGMLQuantizationType.F16: + return data.astype(np.float16, copy=False) + elif (q := _type_traits.get(qtype)) is not None: + return q.quantize(data) else: - return __quantize_q8_0_array(data) + raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented") + + +def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16: + return data.astype(np.float32, copy=False) + elif (q := 
_type_traits.get(qtype)) is not None: + return q.dequantize(data) + else: + raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented") + + +class __Quant(ABC): + qtype: GGMLQuantizationType + block_size: int + type_size: int + + def __init__(self): + return TypeError("Quant conversion classes can't have instances") + + def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: + cls.qtype = qtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] + cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_array, + meta_noop=(np.uint8, cls.__shape_to_bytes) + ) + cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_array, + meta_noop=(np.float32, cls.__shape_from_bytes) + ) + assert qtype not in _type_traits + _type_traits[qtype] = cls + + @classmethod + @abstractmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + @abstractmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + def quantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.astype(np.float32, copy=False) + shape = rows.shape + n_blocks = rows.size // cls.block_size + blocks = rows.reshape((n_blocks, cls.block_size)) + blocks = cls.quantize_blocks(blocks) + assert blocks.dtype == np.uint8 + assert blocks.shape[-1] == cls.type_size + return blocks.reshape(cls.__shape_to_bytes(shape)) + + @classmethod + def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.view(np.uint8) + shape = rows.shape + n_blocks = rows.size // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + blocks = cls.dequantize_blocks(blocks) + assert blocks.dtype == np.float32 + assert blocks.shape[-1] == cls.block_size + return blocks.reshape(cls.__shape_from_bytes(shape)) + + @classmethod + def __shape_to_bytes(cls, shape: Sequence[int]): + return quant_shape_to_byte_shape(shape, cls.qtype) + + @classmethod + def __shape_from_bytes(cls, shape: Sequence[int]): + return quant_shape_from_byte_shape(shape, cls.qtype) + + @classmethod + def __quantize_array(cls, array: np.ndarray) -> np.ndarray: + return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape)) + + @classmethod + def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: + return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape)) + + @classmethod + def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass + + @classmethod + def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass + + @classmethod + def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool: + return tensor.shape[-1] % cls.block_size == 0 + + @classmethod + def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if not cls.can_quantize(tensor): + raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}") + if isinstance(tensor, LazyNumpyTensor): + return cls.__quantize_lazy(tensor) + else: + return cls.__quantize_array(tensor) + + @classmethod + def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if isinstance(tensor, LazyNumpyTensor): + return cls.__dequantize_lazy(tensor) + else: + return cls.__dequantize_array(tensor) + + +class BF16(__Quant, qtype=GGMLQuantizationType.BF16): + @classmethod + # same as ggml_compute_fp32_to_bf16 in ggml-impl.h + def quantize_blocks(cls, 
blocks: np.ndarray) -> np.ndarray: + n = blocks.view(np.uint32) + # force nan to quiet + n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n) + # round to nearest even + n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16 + return n.astype(np.uint16).view(np.uint8) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32) + + +class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): + @classmethod + # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + + d = abs(blocks).max(axis=1, keepdims=True) / 127 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + + # (n_blocks, 2) + d = d.astype(np.float16).view(np.uint8) + # (n_blocks, block_size) + qs = qs.astype(np.int8).view(np.uint8) + + return np.concatenate([d, qs], axis=1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + d, x = np.split(blocks, [2], axis=1) + d = d.view(np.float16).astype(np.float32) + x = x.view(np.int8).astype(np.float32) + + return (x * d) From 345a686d8271a24db20d31789c0d4b9ed51dcb0c Mon Sep 17 00:00:00 2001 From: compilade Date: Thu, 8 Aug 2024 23:54:00 -0400 Subject: [PATCH 103/154] llama : reduce useless copies when saving session (#8916) * llama : avoid useless copies in dummy session writer * llama : avoid double tensor copy when saving session to buffer --- src/llama.cpp | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a7b1c9ebd..68512d2ef 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17343,6 +17343,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi // TODO: replace all non-fatal assertions with returned errors or exceptions struct llama_data_write { virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; virtual size_t get_size_written() = 0; virtual ~llama_data_write() = default; @@ -17465,9 +17466,8 @@ struct llama_data_write { // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * k_size_row; + write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); } } @@ -17486,9 +17486,8 @@ struct llama_data_write { // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * v_size_row; + write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); } } } else { @@ -17514,9 +17513,8 @@ struct llama_data_write { for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * 
v_size_el; - tmp_buf.resize(range_size * v_size_el); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * v_size_el; + write_tensor_data(kv_self.v_l[il], src_offset, buf_size); } } } @@ -17875,12 +17873,14 @@ struct llama_data_write_dummy : llama_data_write { llama_data_write_dummy() {} - // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context - void write(const void * /* src */, size_t size) override { size_written += size; } + void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; + } + size_t get_size_written() override { return size_written; } @@ -17903,6 +17903,16 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; + } + size_t get_size_written() override { return size_written; } @@ -17938,6 +17948,7 @@ struct llama_data_read_buffer : llama_data_read { struct llama_data_write_file : llama_data_write { llama_file * file; size_t size_written = 0; + std::vector temp_buffer; llama_data_write_file(llama_file * f) : file(f) {} @@ -17946,6 +17957,12 @@ struct llama_data_write_file : llama_data_write { size_written += size; } + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); + } + size_t get_size_written() override { return size_written; } From daef3ab233fc02e4c53afd9b07366379876f00d1 Mon Sep 17 00:00:00 2001 From: Mathieu Geli Date: Fri, 9 Aug 2024 08:32:02 +0200 Subject: [PATCH 104/154] server : add one level list nesting for embeddings (#8936) --- examples/server/server.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 898c83ea3..360f571e4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -975,6 +975,8 @@ struct server_context { (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) { slot.prompt = *prompt; + } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) { + slot.prompt = prompt->at(0); } else { send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); return false; From 6f6496bb0999d1bce5daff0cfc55ceb0dd13c888 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 9 Aug 2024 08:32:23 +0200 Subject: [PATCH 105/154] llama : fix typo in llama_tensor_get_type comment [no ci] (#8937) --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 68512d2ef..be6dbf88a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15304,7 +15304,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are 
not consecutive, but iccasionally randomly + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. From 5b2c04f4925e152c362b824f4b41eb2a081fa623 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 9 Aug 2024 08:33:30 +0200 Subject: [PATCH 106/154] embedding : add --pooling option to README.md [no ci] (#8934) This commit adds the `--pooling` option to the README.md file in the `examples/embedding` directory. The motivation for adding this options is that currently if the model used does not specify a pooling type the embedding example will fail with the following error message: ```console main: error: pooling type NONE not supported ``` This commit also updates the name of the executable in the examples section. --- examples/embedding/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index e3705b454..12b372bf1 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. 
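For context on the `--pooling mean` flag added in this patch: without a pooling mode the encoder emits one vector per token, and a model whose GGUF metadata does not name a pooling type makes the example abort with the error quoted in the commit message. Mean pooling simply averages the per-token vectors into a single sequence embedding, roughly as in this illustrative sketch (not the llama.cpp implementation):

```cpp
#include <vector>

// Average n_tokens per-token embeddings (each of dimension n_embd)
// into one sequence-level vector.
std::vector<float> mean_pool(const std::vector<std::vector<float>> & token_embd) {
    const size_t n_embd = token_embd.front().size();
    std::vector<float> out(n_embd, 0.0f);
    for (const auto & e : token_embd) {
        for (size_t i = 0; i < n_embd; ++i) {
            out[i] += e[i];
        }
    }
    for (size_t i = 0; i < n_embd; ++i) {
        out[i] /= (float) token_embd.size();
    }
    return out;
}
```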
### Unix-based systems (Linux, macOS, etc.): ```bash -./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` From 70c0ea35609a2ab87a358e25f4ffad1aad408992 Mon Sep 17 00:00:00 2001 From: Matt Stephenson Date: Tue, 16 Jul 2024 03:21:09 -0400 Subject: [PATCH 107/154] whisper : use vulkan as gpu backend when available (whisper/2302) * ggml: use vulkan as gpu backend when available Signed-off-by: Matt Stephenson * whisper: enable using vk as default buffer type Signed-off-by: Matt Stephenson --------- Signed-off-by: Matt Stephenson --- ggml/src/ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index d7fea78d0..b0f36a513 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -2108,9 +2108,9 @@ void ggml_vk_instance_init() { } static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { - GGML_ASSERT(idx < vk_instance.device_indices.size()); VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); ggml_vk_instance_init(); + GGML_ASSERT(idx < vk_instance.device_indices.size()); ctx->name = GGML_VK_NAME + std::to_string(idx); From 4305b57c80eff4f0df5f6acb60b292f03b8f0dd0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 9 Aug 2024 10:03:48 +0300 Subject: [PATCH 108/154] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 32b198983..eef6768b1 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -6c71d5a071d842118fb04c03c4b15116dff09621 +797faa25af14126eb30134d4033139ae3c5428ed From 3071c0a5f218f107dabd13b73f6090af683ef5ec Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Fri, 9 Aug 2024 18:33:53 +0800 Subject: [PATCH 109/154] llava : support MiniCPM-V-2.5 (#7599) * init * rename * add run android for termux in readme * add android readme * add instructions in readme * change name in readme * Update README.md * fixed line * add result in readme * random pos_embed * add positions index * change for ollama * change for ollama * better pos_embed in clip * support ollama * updata cmakelist * updata cmakelist * rename wrapper * clear code * replace and organize code * add link * sync master * fix warnings * fix warnings * fix bug in bicubic resize when need resize iamge smaller * receive review comments and modify * receive review comments and modify * put all code into llava dir * fix quality problem in pr code * change n_layer * add space in "-1" * imitate reshape bug of python code * fix bug in clip * fix issues for merging * fix llama-minicpmv-cli in cmake file * change pr readme * fix code 
review * remove in line 33 directory in the /cmakelists.txt (not in example, in the main dir * fix cmakefile * add warn * fix KEY_HAS_MINICPMV_PROJ * remove load_image_size into clip_ctx * remove the extern "C", MINICPMV_API * fix uhd code for review comment * delete minicpmv-wrapper in pr * remove uhd_image_embed * Modify 2 notes * clip : style changes * del common.h in clip * fix Type-Check error * fix Type-Check error * fix Type-Check error * fix Type-Check error * fix makefile error * fix ubuntu-make error * try fix clip * try fix 1 --------- Co-authored-by: Hongji Zhu Co-authored-by: harvestingmoon Co-authored-by: Georgi Gerganov --- .gitignore | 1 - Makefile | 12 + examples/llava/CMakeLists.txt | 7 + examples/llava/README-minicpmv2.5.md | 99 +++ examples/llava/clip.cpp | 616 ++++++++++++++++-- examples/llava/clip.h | 13 +- examples/llava/llava.cpp | 75 ++- examples/llava/llava.h | 5 +- examples/llava/minicpmv-cli.cpp | 309 +++++++++ .../minicpmv-convert-image-encoder-to-gguf.py | 382 +++++++++++ examples/llava/minicpmv-surgery.py | 47 ++ examples/llava/requirements.txt | 1 + 12 files changed, 1488 insertions(+), 79 deletions(-) create mode 100644 examples/llava/README-minicpmv2.5.md create mode 100644 examples/llava/minicpmv-cli.cpp create mode 100644 examples/llava/minicpmv-convert-image-encoder-to-gguf.py create mode 100644 examples/llava/minicpmv-surgery.py diff --git a/.gitignore b/.gitignore index c9b4d9983..5ae030200 100644 --- a/.gitignore +++ b/.gitignore @@ -79,7 +79,6 @@ models-mnt !models/ggml-vocab-*.gguf* # Zig - zig-out/ zig-cache/ diff --git a/Makefile b/Makefile index 5000f5819..9584a44ad 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ BUILD_TARGETS = \ llama-imatrix \ llama-infill \ llama-llava-cli \ + llama-minicpmv-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -1463,6 +1464,17 @@ llama-llava-cli: examples/llava/llava-cli.cpp \ $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) +llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ + examples/llava/clip.h \ + examples/llava/clip.cpp \ + examples/llava/llava.h \ + examples/llava/llava.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index e9fa73acb..bbf5fec58 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET llama-minicpmv-cli) +add_executable(${TARGET} minicpmv-cli.cpp) +set_target_properties(${TARGET} 
PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md
new file mode 100644
index 000000000..4affc1d0f
--- /dev/null
+++ b/examples/llava/README-minicpmv2.5.md
@@ -0,0 +1,99 @@
+## MiniCPM-Llama3-V 2.5
+
+### Prepare models and code
+
+Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into a "MiniCPM-Llama3-V-2_5" folder.
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+### Usage
+
+Convert the PyTorch model to GGUF files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we publish):
+
+```bash
+python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
+
+# quantize to int4
+./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac:
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Run inference on Linux or Mac:
+```
+# run the f16 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run the quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Android
+
+#### Build on an Android device using Termux
+We found that building directly on the Android device gives better runtime performance, so we recommend building on the device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
+
+Install the build tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model into the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the project using the Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make +``` + +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). + +Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ +$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7cda5f10c..97823a065 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -74,26 +74,27 @@ static std::string format(const char * fmt, ...) { // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE 
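The `TN_*` name templates added just below are expanded with the `format()` helper shown earlier in `clip.cpp`; for instance `format(TN_MINICPMV_ATTN, "q", "weight")` produces the GGUF tensor name `resampler.attn.q.weight` that the loader looks up. The same expansion, illustrated standalone with plain `snprintf` (equivalent output, not the helper itself):

```cpp
#include <cstdio>

int main() {
    const char * TN_MINICPMV_ATTN = "resampler.attn.%s.%s";
    char name[128];
    // expands to "resampler.attn.q.weight", the key looked up in the GGUF file
    snprintf(name, sizeof(name), TN_MINICPMV_ATTN, "q", "weight");
    printf("%s\n", name);
    return 0;
}
```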
"clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -127,12 +128,20 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_UNKNOWN, }; @@ -140,6 +149,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_RESAMPLER, "resampler"}, }; @@ -492,12 +502,33 @@ struct clip_vision_model { struct ggml_tensor * mm_model_mlp_2_b; struct ggml_tensor * mm_model_peg_0_w; struct ggml_tensor * mm_model_peg_0_b; + + // MINICPMV projection + struct ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; }; struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; bool has_llava_projector = false; + bool has_minicpmv_projector = false; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -522,9 +553,11 @@ struct clip_ctx { ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; + + struct clip_image_size * load_image_size; }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -533,20 +566,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + if (load_image_size == nullptr) { + load_image_size = clip_image_size_init(); + } + LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + image_size_width = load_image_size->width; + image_size_height = load_image_size->height; + if (is_inf) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / 
patch_size)); - const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; + int n_layer = hparams.n_layer; const float eps = hparams.eps; const int batch_size = imgs->size; - if (ctx->has_llava_projector) { + if (ctx->has_llava_projector || ctx->has_minicpmv_projector) { GGML_ASSERT(batch_size == 1); } @@ -559,7 +605,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); @@ -572,19 +618,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); } - - // concat class_embeddings and patch_embeddings struct ggml_tensor * embeddings = inp; - if (ctx->has_class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + struct ggml_tensor * pos_embed = nullptr; + if (ctx->has_llava_projector) { + // concat class_embeddings and patch_embeddings + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } + } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); @@ -593,6 +641,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + if (ctx->has_minicpmv_projector) { + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + } + // pre-layernorm if (ctx->has_pre_norm) { embeddings = ggml_norm(ctx0, embeddings, eps); @@ -602,6 +658,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers + if (ctx->has_minicpmv_projector) { + n_layer += 1; + } for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = 
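Since the graph now takes its width and height from the slice actually loaded rather than a fixed square `image_size`, the patch count follows directly from the loaded dimensions. At the resampler defaults used later in this patch (448x448 slices, 14-pixel patches):

$$ n_{\text{patches}} = \frac{W}{p}\cdot\frac{H}{p} = \frac{448}{14}\cdot\frac{448}{14} = 32\cdot 32 = 1024 $$

and the resampler then compresses each slice down to 96 output embeddings (see `clip_n_patches()` below).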
hidden_states @@ -691,7 +750,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // llava projector - { + if (ctx->has_llava_projector) { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); @@ -872,6 +931,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ABORT("fatal error"); } } + // minicpmv projector + else if (ctx->has_minicpmv_projector) + { + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + const int hidden_size = 4096; + const int d_head = 128; + const int n_head = hidden_size/d_head; + const int num_query = 96; + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + } + else { + GGML_ASSERT(false); + } + } // build the graph ggml_build_forward_expand(gf, embeddings); @@ -1029,7 +1147,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); } - GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); + if (idx != -1) { + new_clip->has_minicpmv_projector = 
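The minicpmv projector block above is a single-layer cross-attention resampler: 96 learned query vectors attend over the position-embedded image features, so the projector output has a fixed length no matter how many patches the slice produced. In the code's own names (hidden_size = 4096, d_head = 128, hence 32 heads, and k = v + pos_embed):

$$ \mathrm{out} = W_o\,\mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\text{head}}}}\right)V + b_o, \qquad Q = W_q q + b_q,\quad K = W_k k + b_k,\quad V = W_v v + b_v $$

The code folds the $1/\sqrt{d_{\text{head}}}$ factor into Q before the matmul, which is equivalent.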
gguf_get_val_bool(ctx, idx); + } + + // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); @@ -1040,6 +1164,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } @@ -1281,6 +1406,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1319,7 +1465,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); 
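Note the probing idiom used above for the new key: `gguf_find_key()` returns -1 for an absent key, so a file without `clip.has_minicpmv_projector` simply leaves the flag false and loads as a plain LLaVA projector. The same idiom as a small helper (a sketch; this helper does not exist in the tree, but `gguf_find_key`/`gguf_get_val_bool` are the real API, declared in `ggml.h` at this point in history):

```cpp
#include "ggml.h"

// Probe an optional boolean GGUF key; an absent key keeps the fallback value.
static bool gguf_get_bool_or(const struct gguf_context * ctx, const char * key, bool fallback) {
    const int idx = gguf_find_key(ctx, key);
    return idx == -1 ? fallback : gguf_get_val_bool(ctx, idx);
}
```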
     clip_image_f32_batch batch;
     batch.size = 1;
-    ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+    ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
     ggml_gallocr_reserve(new_clip->compute_alloc, gf);
     size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
     LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
@@ -1328,6 +1474,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }

+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = load_image_size;
+}
+
+struct clip_image_size * clip_image_size_init() {
+    struct clip_image_size * load_image_size = new struct clip_image_size();
+    load_image_size->width = 448;
+    load_image_size->height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
@@ -1598,9 +1755,184 @@ static std::vector<clip_image_u8 *> divide_to_patches_u8(const clip_image_u8 & im
     return patches;
 }

+static int ensure_divide(int length, int patch_size) {
+    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+}
+
+static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width = original_size.first;
+    int height = original_size.second;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height = static_cast<int>(scale_resolution / std::sqrt(r));
+        width = static_cast<int>(height * r);
+    }
+    int best_width = ensure_divide(width, patch_size);
+    int best_height = ensure_divide(height, patch_size);
+    return std::make_pair(best_width, best_height);
+}
+
+static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width, height;
+    std::tie(width, height) = original_size;
+    int grid_x, grid_y;
+    std::tie(grid_x, grid_y) = grid;
+
+    int refine_width = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    int grid_width = refine_width / grid_x;
+    int grid_height = refine_height / grid_y;
+
+    // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
+    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
+    int best_grid_width, best_grid_height;
+    std::tie(best_grid_width, best_grid_height) = best_grid_size;
+
+    // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
+    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
+    return refine_size;
+}
+
+inline int clip(int x, int lower, int upper) {
+    return std::max(lower, std::min(x, upper));
+}
+
+static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<std::pair<int, int>> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.emplace_back(m, split_grids_nums / m);
+            }
+            ++m;
+        }
+    }
+
+    std::pair<int, int> best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
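`uhd_best_grid()` selects the slice layout purely by aspect ratio: over every m x n factorization of the candidate slice counts (multiple - 1, multiple, multiple + 1, skipping 1 and anything over max_slice_nums), it minimizes the log-space shape error

$$ (m^{*}, n^{*}) = \arg\min_{m \times n} \left| \log\tfrac{W}{H} - \log\tfrac{m}{n} \right| $$

For example, a 2:1 landscape image whose area calls for multiple = 3 considers 1x2, 2x1, 1x3, 3x1, 1x4, 2x2 and 4x1, and picks 2x1, whose error is exactly zero.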
+
+// inspired from LLaVA-UHD:
+//    -> https://arxiv.org/pdf/2403.11703
+//    -> https://github.com/thunlp/LLaVA-UHD
+//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+    const std::pair<int, int> original_size={img->nx,img->ny};
+    const int original_width = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+    if (clip_is_minicpmv(ctx)) {
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1816,11 +2148,99 @@ int clip_n_patches(const struct clip_ctx * ctx) {

     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        n_patches = 96;
     }

     return n_patches;
 }

+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1843,18 +2263,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }

     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

     // set inputs
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;

-    const int image_size = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width  = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size    = hparams.patch_size;
-    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

     {
@@ -1864,7 +2293,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            GGML_ASSERT(nx == image_size && ny == image_size);
+            if (!ctx->has_minicpmv_projector) {
+                GGML_ASSERT(nx == image_size && ny == image_size);
+            }

             const int n = nx * ny;
@@ -1881,37 +2312,75 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
-
-    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
-
-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = std::floor(70.0*i/num_positions);
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
        }
-    }

-    {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            if(ctx->load_image_size==nullptr){
+                ctx->load_image_size= clip_image_size_init();
+            }
+            int pos_w = ctx->load_image_size->width/patch_size;
+            int pos_h = ctx->load_image_size->height/patch_size;
+            int embed_dim = 4096;
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w*pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
+        }
+    }
+    else{
+        {
+            if (ctx->has_class_embedding) {
+                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
+                void* zero_mem = malloc(ggml_nbytes(embeddings));
+                memset(zero_mem, 0, ggml_nbytes(embeddings));
+                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+                free(zero_mem);
+            }
+        }
-    {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-        int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_patches; i++) {
-            patches_data[i] = i + 1;
+        {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
+            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
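Two position signals are populated in the minicpmv branch above. The token position ids are interpolated onto the 70 entries the SigLIP-style embedding table was trained with, pos_i = floor(70 * i / N) for the i-th of N patch positions, and the resampler's `pos_embed` input is the classic 2D sine/cosine table built by `get_2d_sincos_pos_embed()`: with D = 4096, a patch at grid coordinate (h, w) gets

$$ \omega_j = 10000^{-j/(D/4)}, \qquad \mathrm{PE}(h, w) = \left[ \sin(h\,\omega_j) \,\|\, \cos(h\,\omega_j) \,\|\, \sin(w\,\omega_j) \,\|\, \cos(w\,\omega_j) \right]_{j=0}^{D/4-1} $$

so one D/2 half of the vector encodes the row and the other half the column.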
free(patches_data); } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); } if (ggml_backend_is_cpu(ctx->backend)) { @@ -2081,7 +2550,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0]; } + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + return 4096; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } + +bool clip_is_minicpmv(const struct clip_ctx * ctx) { + return ctx->has_minicpmv_projector; +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index ca3631384..2ff4d3992 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -18,14 +18,17 @@ # define CLIP_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif struct clip_ctx; +struct clip_image_size { + int width; + int height; +}; + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); +CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); + +CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); +CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); + #ifdef __cplusplus } #endif diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 63878d176..916d9dc40 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector return true; } +static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { + int width = image->nx; + int height = image->ny; + int num_patches = (height / patch_size) * (width / patch_size); + clip_image_f32 * patch = clip_image_f32_init(); + patch->nx = patch_size * num_patches; + patch->ny = patch_size; + patch->buf.resize(3 * patch->nx * patch->ny); + + int patch_index = 0; + + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + for (int pi = 0; pi < patch_size; ++pi) { + for (int pj = 0; pj < patch_size; ++pj) { + int input_index = ((i + pi) * width + (j + pj)) * 3; + int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; + patch->buf[output_index] = image->buf[input_index]; + patch->buf[output_index+1] = image->buf[input_index+1]; + patch->buf[output_index+2] = image->buf[input_index+2]; + } + } + patch_index++; + } + } + return patch; +} static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 @@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli 
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { + if (clip_is_minicpmv(ctx_clip)) { + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size); + struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { + const int64_t t_img_enc_step_start_us = ggml_time_us(); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + int patch_size=14; + load_image_size->width = img_res_v.data[i].nx; + load_image_size->height = img_res_v.data[i].ny; + clip_add_load_image_size(ctx_clip, load_image_size); + const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + if (!encoded) { + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + return false; + } + const int64_t t_img_enc_steop_batch_us = ggml_time_us(); + LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + int n_img_pos_out = 0; + for (size_t i = 0; i < image_embd_v.size(); i++) { + std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); + n_img_pos_out += clip_n_patches(ctx_clip); + } + *n_img_pos = n_img_pos_out; + for (size_t i = 0; i < image_embd_v.size(); i++) { + free(image_embd_v[i]); + } + image_embd_v.clear(); + load_image_size->width = img->nx; + load_image_size->height = img->ny; + clip_add_load_image_size(ctx_clip, load_image_size); + LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + } + else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else { + } + else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; @@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model + int num_max_patches = 6; + if (clip_is_minicpmv(ctx_clip)) { + num_max_patches = 10; + } + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 19212f6e9..b6feb3027 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -17,12 +17,11 @@ # define LLAVA_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif +struct clip_ctx; struct 
llava_image_embed { float * embed; int n_image_pos; @@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** free an embedding made with llava_image_embed_make_* */ +LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp new file mode 100644 index 000000000..f951b57b2 --- /dev/null +++ b/examples/llava/minicpmv-cli.cpp @@ -0,0 +1,309 @@ +#include "ggml.h" +#include "log.h" +#include "common.h" +#include "clip.h" +#include "llava.h" +#include "llama.h" + +#include +#include +#include + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +static struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + if (params->n_ctx < 2048) { + // warn user here, "Image processing requires at least 2048 context, setting context to 2048" + LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + ctx_params.n_ctx = 2048; + } else { + ctx_params.n_ctx = params->n_ctx; + } + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->model = model; + return ctx_llava; +} + +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + 
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
new file mode 100644
index 000000000..f951b57b2
--- /dev/null
+++ b/examples/llava/minicpmv-cli.cpp
@@ -0,0 +1,309 @@
+#include "ggml.h"
+#include "log.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG_TEE("\n example usage: %s -m <model> --mmproj <mmproj> --image <image> --image <image> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
+static struct llama_model * llava_init(gpt_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct llama_sampling_context * ctx_sampling,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    static std::string ret;
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
+    auto ctx_clip = clip_init_context(params);
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+    if (!embeds) {
+        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_TEE("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    // generate the response
+
+    LOG_TEE("\n");
+
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    return ctx_sampling;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
+
+    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        gpt_params_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG_TEE("<user>%s\n", params.prompt.c_str());
+            LOG_TEE("<assistant>");
+            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response = "";
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if(!have_tmp)continue;
+                    else break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            llama_sampling_free(ctx_sampling);
+        }else {
+            while (true) {
+                LOG_TEE("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG_TEE("<assistant>");
+                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response = "";
+                for (int i = 0; i < max_tgt_len; i++) {
+                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    if (strstr(tmp, "###")) break; // Yi-VL behavior
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                llama_sampling_free(ctx_sampling);
+            }
+        }
+        printf("\n");
+        llama_print_timings(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}
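Editorial aside: process_image() above interleaves the per-slice embeddings with literal marker tokens. A sketch of the text layout it produces (illustration only, not part of the patch; layout_for and cols are hypothetical names, cols standing in for clip_uhd_num_image_embeds_col()):

    // sketch: prompt layout for the full image plus n_extra slices in `cols` columns;
    // "E" marks the position where a slice embedding is written into the context
    #include <string>

    static std::string layout_for(int n_extra, int cols) {
        std::string s = "<image>E</image>"; // full-image embed comes first
        if (n_extra > 0) {
            s += "<slice>";
            for (int i = 0; i < n_extra / cols; ++i) {
                for (int j = 0; j < cols; ++j) {
                    s += "<image>E</image>";
                    if (j == cols - 1) s += "\n"; // newline after each slice row
                }
            }
            s += "</slice>";
        }
        return s;
    }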
diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
new file mode 100644
index 000000000..12cdd1281
--- /dev/null
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -0,0 +1,382 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def add_key_str(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_minicpmv and name in ["visual_projection.weight"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type)")
+ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --vision-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+# if args.clip_model_is_vision or args.clip_model_is_openclip:
+#     model = CLIPVisionModel.from_pretrained(dir_model)
+#     processor = None
+# else:
+#     model = CLIPModel.from_pretrained(dir_model)
+#     processor = CLIPProcessor.from_pretrained(dir_model)
+
+default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+vision_config = Idefics2VisionConfig(**default_vision_config)
+model = Idefics2VisionTransformer(vision_config)
+
+processor = None
+# if model.attn_pool is not None:
+#     model.attn_pool = torch.nn.Identity()
+
+# model.blocks = model.blocks[:-1]
+model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_minicpmv_projector = False
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.minicpmv_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_minicpmv_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
+fout.add_file_type(ftype)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_minicpmv_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_minicpmv_projector:
+    fout.add_description("image encoder for MiniCPM-V")
+    # add projector type
+    fout.add_string("clip.projector_type", "resampler")
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", 448)
+    fout.add_uint32("clip.vision.patch_size", 14)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+    fout.add_uint32("clip.vision.projection_dim", 0)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    block_count = 26
+    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
+
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = True
+fout.add_bool("clip.use_gelu", use_gelu)
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+def _replace_name_resampler(s, v):
+    if re.match("resampler.pos_embed", s):
+        return {
+            s: v,
+            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
+        }
+    if re.match("resampler.proj", s):
+        return {
+            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
+            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
+        }
+    if re.match("resampler.attn.in_proj_.*", s):
+        return {
+            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
+            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
+            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
+        }
+    return {s: v}
+
+if has_minicpmv_projector:
+    projector = torch.load(args.minicpmv_projector)
+    new_state_dict = {}
+    for k, v in projector.items():
+        kvs = _replace_name_resampler(k, v)
+        for nk, nv in kvs.items():
+            new_state_dict[nk] = nv
+    projector = new_state_dict
+    ftype_cur = 0
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        data = data.squeeze().numpy()
+
+        n_dims = len(data.shape)
+        if ftype == 1:
+            if name[-7:] == ".weight" and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+        else:
+            if data.dtype != np.float32:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        fout.add_tensor(name, data)
+        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+
+    print("Projector tensors added\n")
+
+def _replace_name(s, v):
+    s = "vision_model." + s
+    if re.match("vision_model.embeddings.position_embedding", s):
+        v = v.unsqueeze(0)
+        return {s: v}
+
+    return {s: v}
+
+state_dict = model.state_dict()
+new_state_dict = {}
+for k, v in state_dict.items():
+    kvs = _replace_name(k, v)
+    for nk, nv in kvs.items():
+        new_state_dict[nk] = nv
+state_dict = new_state_dict
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
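Editorial aside: the get_*_sincos_pos_embed helpers in the converter above build the standard fixed sinusoidal table (the MAE variant linked in the code), with the sine block concatenated before the cosine block rather than interleaved. In LaTeX form, for position $p$ and even embedding width $D$:

    $\omega_i = 10000^{-2i/D}, \quad i = 0, \ldots, D/2 - 1$
    $\mathrm{emb}(p) = \big[\sin(p\,\omega_0), \ldots, \sin(p\,\omega_{D/2-1}),\ \cos(p\,\omega_0), \ldots, \cos(p\,\omega_{D/2-1})\big]$

The 2D variant splits $D$ in half, encoding the grid row coordinate with the first half and the column coordinate with the second.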
diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py
new file mode 100644
index 000000000..2b6bce7cf
--- /dev/null
+++ b/examples/llava/minicpmv-surgery.py
@@ -0,0 +1,47 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model")
+args = ap.parse_args()
+
+# find the model part that includes the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/minicpmv.projector")
+
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/minicpmv.clip")
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+config = model.llm.config
+config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
+config.auto_map = {
+    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
+    "AutoModel": "modeling_minicpm.MiniCPMModel",
+    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+}
+model.llm.save_pretrained(f"{args.model}/model")
+tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+tok.save_pretrained(f"{args.model}/model")
+# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index fbfd0cdd7..dfe5fbe62 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -2,3 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
+torchvision==0.17.1

From 45a55b91aa87958a75d691be64b070266d4fbb94 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 9 Aug 2024 18:23:52 +0300
Subject: [PATCH 110/154] llama : better replace_all (cont) (#8926)

* llama : better replace_all (cont)

ggml-ci

* code : deduplicate replace_all

ggml-ci
---
 common/common.cpp                    | 11 +++++++++++
 common/common.h                      |  2 ++
 examples/export-lora/export-lora.cpp | 14 --------------
 examples/llava/clip.cpp              | 17 +++++++----------
 src/llama-impl.h                     | 15 +++++++++++++++
 src/llama-vocab.cpp                  | 14 --------------
 src/llama.cpp                        | 11 -----------
 7 files changed, 35 insertions(+), 49 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 2e8374d50..560e20d08 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1777,6 +1777,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
diff --git a/common/common.h b/common/common.h
index d88966ece..bbc33a499 100644
--- a/common/common.h
+++ b/common/common.h
@@ -286,6 +286,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<typename T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index d228ae66e..3176d6e26 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -50,20 +50,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
     return ctx_gguf;
 }

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 struct file_input {
     struct ggml_context * ctx_meta = nullptr;
     struct gguf_context * ctx_gguf = nullptr;
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 97823a065..54aa822c9 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -210,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
     }
-    s = std::move(result);
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
diff --git a/src/llama-impl.h b/src/llama-impl.h
index dcc8c1c15..399b134a7 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e6d6059d0..749f85718 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -16,20 +16,6 @@
 // helpers
 //

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
diff --git a/src/llama.cpp b/src/llama.cpp
index be6dbf88a..decdcebdb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -121,17 +121,6 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
-    }
-    size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
-    }
-}
-
 static bool is_float_close(float a, float b, float abs_tol) {
     // Check for non-negative tolerance
     if (abs_tol < 0.0) {
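Editorial aside: the retired implementation built a fresh string, while the shared helper now edits in place, skipping empty search strings and stepping past each inserted replacement so a replacement that contains the search string cannot recurse. A small self-check (illustration only, not part of the patch):

    #include "common.h"
    #include <cassert>

    int main() {
        std::string s = "aaa";
        string_replace_all(s, "a", "aa"); // advances by replace.length(), so this terminates
        assert(s == "aaaaaa");
        string_replace_all(s, "", "x");   // empty search is guarded: string unchanged
        assert(s == "aaaaaa");
        return 0;
    }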
From 272e3bd95e620d285b2fb9faaa7b6b1b8edbbf3a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 9 Aug 2024 18:24:30 +0300
Subject: [PATCH 111/154] make : fix llava obj file race (#8946)

ggml-ci
---
 Makefile | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 9584a44ad..649671ed6 100644
--- a/Makefile
+++ b/Makefile
@@ -1454,26 +1454,20 @@ libllava.a: examples/llava/llava.cpp \
	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

 llama-llava-cli: examples/llava/llava-cli.cpp \
-	examples/llava/clip.h \
-	examples/llava/clip.cpp \
-	examples/llava/llava.h \
 	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

 llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
-	examples/llava/clip.h \
-	examples/llava/clip.cpp \
-	examples/llava/llava.h \
 	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift

From 6afd1a99dc9792096d4567ab9fa1ad530c81c6cd Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 9 Aug 2024 18:53:09 +0200
Subject: [PATCH 112/154] llama : add support for lora adapters in T5 model (#8938)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Stanisław Szymczyk
---
 src/llama.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index decdcebdb..97dd1b3fe 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13167,13 +13167,13 @@ struct llm_build_context {

             // self-attention
             {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
                 cb(Kcur, "Kcur", il);

-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -13207,7 +13207,7 @@ struct llm_build_context {

                 ggml_build_forward_expand(gf, cur);

-                cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
                 cb(cur, "kqv_out", il);
             }
@@ -13281,13 +13281,13 @@ struct llm_build_context {

             // self-attention
             {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);

-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
@@ -13334,7 +13334,7 @@ struct llm_build_context {

                 ggml_build_forward_expand(gf, cur);

-                cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "kqv_out", il);
             }
@@ -13351,13 +13351,13 @@ struct llm_build_context {

             // cross-attention
            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
                 cb(Kcur, "Kcur", il);

-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -13386,7 +13386,7 @@ struct llm_build_context {

                 ggml_build_forward_expand(gf, cur);

-                cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
                 cb(cur, "kqv_out", il);
             }
@@ -13443,7 +13443,7 @@ struct llm_build_context {
             cb(cur, "result_norm", -1);

             // lm_head
-            cur = ggml_mul_mat(ctx0, model.output, cur);
+            cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
             cb(cur, "result_output", -1);
         }
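Editorial aside: the mechanical substitution above is what enables LoRA for T5. llm_build_lora_mm wraps the plain matrix product so that each adapted weight W also applies its adapter deltas; schematically (editor's notation, not from the patch), for loaded adapters a with scales s_a:

    $y = W x + \sum_a s_a \, B_a (A_a x)$

Replacing each ggml_mul_mat(ctx0, W, x) call in the T5 graph with the wrapper is therefore sufficient to make the encoder, decoder and lm_head projections LoRA-aware.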
From b72942fac998672a79a1ae3c03b340f7e629980b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 9 Aug 2024 23:03:21 +0300
Subject: [PATCH 113/154] Merge commit from fork

---
 examples/rpc/README.md      |  4 ++++
 examples/rpc/rpc-server.cpp | 13 ++++++++++++-
 ggml/src/ggml-rpc.cpp       | 36 +++++++++++++++++++++++++++++++++++-
 ggml/src/ggml.c             |  3 ++-
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/examples/rpc/README.md b/examples/rpc/README.md
index e1da801f2..adedc8909 100644
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -1,5 +1,9 @@
 ## Overview

+> [!IMPORTANT]
+> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
+> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
+
 The `rpc-server` allows running `ggml` backend on a remote host.
 The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
 This can be used for distributed LLM inference with `llama.cpp` in the following way:
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 7c15d2aa4..6342e6488 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -16,7 +16,7 @@
 #include <string>

 struct rpc_server_params {
-    std::string host = "0.0.0.0";
+    std::string host = "127.0.0.1";
     int port        = 50052;
     size_t backend_mem = 0;
 };
@@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
+        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
     ggml_backend_t backend = create_backend();
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index b01ad2674..7757615f5 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
         fprintf(stderr, "Failed to set SO_REUSEADDR\n");
         return nullptr;
     }
+    if (inet_addr(host) == INADDR_NONE) {
+        fprintf(stderr, "Invalid host address: %s\n", host);
+        return nullptr;
+    }
     struct sockaddr_in serv_addr;
     serv_addr.sin_family = AF_INET;
     serv_addr.sin_addr.s_addr = inet_addr(host);
@@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
     if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
         return nullptr;
     }
+
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+
     result->op = (ggml_op) tensor->op;
     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
         result->op_params[i] = tensor->op_params[i];
@@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
     uint64_t offset;
     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
-    size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
+    const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);

     struct ggml_init_params params {
         /*.mem_size   =*/ ggml_tensor_overhead(),
@@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
         return false;
     }
     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+        }
+    }
+
     const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
     ggml_backend_tensor_set(tensor, data, offset, size);
     ggml_free(ctx);
@@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+        }
+    }
+
     // output serialization format: | data (size bytes) |
     output.resize(size, 0);
     ggml_backend_tensor_get(tensor, output.data(), offset, size);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c937b5e53..38990e3a0 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3724,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_tensor * view_src,
         size_t view_offs) {

-    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+    GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
+    GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
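Editorial aside: the checks added to the RPC server above all reduce to one predicate: the requested byte range must lie inside the backing buffer and must not wrap around. A condensed restatement (illustration only, not part of the patch; range_in_buffer is a hypothetical helper with a simplified signature):

    #include <cstdint>

    // true iff [data, data + size) fits inside [base, base + buf_size)
    static bool range_in_buffer(uint64_t data, uint64_t size, uint64_t base, uint64_t buf_size) {
        return data + size >= data                 // no uint64 overflow
            && data >= base
            && data + size <= base + buf_size;
    }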
From 911b437f228e75aa3d235acec21bfddd23ecce2f Mon Sep 17 00:00:00 2001
From: Matteo Mortari
Date: Sat, 10 Aug 2024 07:58:49 +0200
Subject: [PATCH 114/154] gguf-py : fix double call to add_architecture()
 (#8952)

Signed-off-by: tarilabs
---
 gguf-py/examples/writer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gguf-py/examples/writer.py b/gguf-py/examples/writer.py
index f39eed1af..731873a7d 100755
--- a/gguf-py/examples/writer.py
+++ b/gguf-py/examples/writer.py
@@ -15,7 +15,6 @@ def writer_example() -> None:
     # Example usage with a file
     gguf_writer = GGUFWriter("example.gguf", "llama")

-    gguf_writer.add_architecture()
     gguf_writer.add_block_count(12)
     gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
     gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float

From 7c3f55c10051c634546247387c5c359c9d499360 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Sat, 10 Aug 2024 11:43:26 +0200
Subject: [PATCH 115/154] Add support for encoder-only T5 models (#8900)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* gguf-py : add T5ENCODER model architecture

* common : call llama_decode() during warmup only if the model has decoder

* convert-hf : add T5EncoderModel

* llama : add llama_model_has_decoder() API function

* llama : split build_t5() into build_t5_encoder() and build_t5_decoder()

* llama : add support for LLM_ARCH_T5ENCODER

* llama-embedding : add support for LLAMA_POOLING_TYPE_NONE

* llama-embedding : add support for encoder-only models

---------

Co-authored-by: Stanisław Szymczyk
---
 common/common.cpp                |   4 +-
 convert_hf_to_gguf.py            | 139 ++++++
 examples/embedding/embedding.cpp | 144 ++++--
 gguf-py/gguf/constants.py        |  17 +
 include/llama.h                  |   3 +
 src/llama.cpp                    | 730 +++++++++++++++++++------------
 6 files changed, 702 insertions(+), 335 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 560e20d08..d3d896115 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2156,7 +2156,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7136db440..550dd5cfd 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3324,6 +3324,145 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]

+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index cd7b448a6..b05aa006e 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -31,13 +31,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 }

 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
+
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);

     // run model
     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            fprintf(stderr, "%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            fprintf(stderr, "%s : failed to decode\n", __func__);
+        }
     }

     for (int i = 0; i < batch.n_tokens; i++) {
@@ -45,11 +56,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             continue;
         }

-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        const float * embd = nullptr;
+        int embd_pos = 0;

-        float * out = output + batch.seq_id[i][0] * n_embd;
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
+
+        float * out = output + embd_pos * n_embd;
         llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -93,8 +115,9 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);

     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
@@ -153,13 +176,23 @@ int main(int argc, char ** argv) {
     const int n_prompts = prompts.size();
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);

+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
     // allocate output
     const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();

     // break into batches
-    int p = 0; // number of prompts processed already
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
@@ -169,11 +202,11 @@ int main(int argc, char ** argv) {

         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
+            float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
-            llama_batch_clear(batch);
-            p += s;
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            llama_batch_clear(batch);
         }

         // add to batch
@@ -182,40 +215,63 @@ int main(int argc, char ** argv) {
     }

     // final batch
-    float * out = emb + p * n_embd;
+    float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

     if (params.embd_out.empty()) {
-        // print the first part of the embeddings or for a single prompt, the full embedding
         fprintf(stdout, "\n");
-        for (int j = 0; j < n_prompts; j++) {
-            fprintf(stdout, "embedding %d: ", j);
-            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-                if (params.embd_normalize == 0) {
-                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
-                } else {
-                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-                }
-            }
-            fprintf(stdout, "\n");
-        }

-        // print cosine similarity matrix
-        if (n_prompts > 1) {
-            fprintf(stdout, "\n");
-            printf("cosine similarity matrix:\n\n");
-            for (int i = 0; i < n_prompts; i++) {
-                fprintf(stdout, "%6.6s ", prompts[i].c_str());
-            }
-            fprintf(stdout, "\n");
-            for (int i = 0; i < n_prompts; i++) {
-                for (int j = 0; j < n_prompts; j++) {
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f ", sim);
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                fprintf(stdout, "embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                fprintf(stdout, " ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                    }
                 }
-                fprintf(stdout, "%1.10s", prompts[i].c_str());
                 fprintf(stdout, "\n");
             }
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                fprintf(stdout, "embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                fprintf(stdout, "\n");
+            }
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                fprintf(stdout, "\n");
+                printf("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    fprintf(stdout, "%6.6s ", prompts[i].c_str());
+                }
+                fprintf(stdout, "\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        fprintf(stdout, "%6.2f ", sim);
+                    }
+                    fprintf(stdout, "%1.10s", prompts[i].c_str());
+                    fprintf(stdout, "\n");
+                }
+            }
         }
     }
@@ -233,23 +289,23 @@ int main(int argc, char ** argv) {
             }
             fprintf(stdout, notArray ? "]\n    }" : "]");
             j++;
-            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
         }
         fprintf(stdout, notArray ? "\n  ]" : "]\n");

         if (params.embd_out == "json+" && n_prompts > 1) {
             fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
-            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
                 fprintf(stdout, "    [");
-                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                     fprintf(stdout, "%6.2f", sim);
                     j++;
-                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) fprintf(stdout, ", "); else break;
                 }
                 fprintf(stdout, " ]");
                 i++;
-                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
             }
             fprintf(stdout, "\n  ]");
         }
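Editorial aside: the key bookkeeping change in the embedding example above is that with pooling disabled the output holds one row per token instead of one row per prompt. A condensed restatement (illustration only, not part of the patch; n_embd_rows is a hypothetical helper):

    #include <cstddef>
    #include <vector>

    // rows of the output matrix: one per token without pooling, one per prompt with pooling
    static size_t n_embd_rows(const std::vector<std::vector<int>> & inputs, bool pooling_none) {
        if (!pooling_none) {
            return inputs.size();
        }
        size_t n = 0;
        for (const auto & toks : inputs) {
            n += toks.size(); // every token contributes its own embedding row
        }
        return n;
    }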
"); + for (int i = n_embd - 3; i < n_embd; i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } } - fprintf(stdout, "%1.10s", prompts[i].c_str()); fprintf(stdout, "\n"); } + } else { + // print the first part of the embeddings or for a single prompt, the full embedding + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } + } } } @@ -233,23 +289,23 @@ int main(int argc, char ** argv) { } fprintf(stdout, notArray ? "]\n }" : "]"); j++; - if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; + if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; } fprintf(stdout, notArray ? "\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); - for (int i = 0;;) { // at least two iteration (n_prompts > 1) + for (int i = 0;;) { // at least two iteration (n_embd_count > 1) fprintf(stdout, " ["); - for (int j = 0;;) { // at least two iteration (n_prompts > 1) + for (int j = 0;;) { // at least two iteration (n_embd_count > 1) float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); fprintf(stdout, "%6.2f", sim); j++; - if (j < n_prompts) fprintf(stdout, ", "); else break; + if (j < n_embd_count) fprintf(stdout, ", "); else break; } fprintf(stdout, " ]"); i++; - if (i < n_prompts) fprintf(stdout, ",\n"); else break; + if (i < n_embd_count) fprintf(stdout, ",\n"); else break; } fprintf(stdout, "\n ]"); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 89efe0c80..f63ec450a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -217,6 +217,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() BITNET = auto() T5 = auto() + T5ENCODER = auto() JAIS = auto() @@ -344,6 +345,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", MODEL_ARCH.JAIS: "jais", } @@ -1036,6 +1038,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.T5ENCODER: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ENC_ATTN_NORM, + MODEL_TENSOR.ENC_ATTN_Q, + MODEL_TENSOR.ENC_ATTN_K, + MODEL_TENSOR.ENC_ATTN_V, + MODEL_TENSOR.ENC_ATTN_OUT, + MODEL_TENSOR.ENC_ATTN_REL_B, + MODEL_TENSOR.ENC_FFN_NORM, + MODEL_TENSOR.ENC_FFN_GATE, + MODEL_TENSOR.ENC_FFN_DOWN, + MODEL_TENSOR.ENC_FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + ], MODEL_ARCH.JAIS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/include/llama.h b/include/llama.h index 66c266298..ce07f4fac 100644 --- 
a/include/llama.h +++ b/include/llama.h @@ -504,6 +504,9 @@ extern "C" { // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); + // Returns true if the model contains a decoder that requires llama_decode() call + LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); + // For encoder-decoder models, this function returns id of the token that must be provided // to the decoder to start generating output sequence. For other models, it returns -1. LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); diff --git a/src/llama.cpp b/src/llama.cpp index 97dd1b3fe..9c4f2aa72 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -208,6 +208,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -252,6 +253,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1261,6 +1263,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_T5ENCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_JAIS, { @@ -5187,6 +5207,12 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_T5ENCODER: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + model.type = e_model::MODEL_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -7421,6 +7447,42 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_T5ENCODER: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm_enc = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}); + + layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_JAIS: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -13135,7 +13197,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5() { + struct ggml_cgraph * build_t5_encoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -13150,303 +13212,323 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - if (lctx.is_encoding) { - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); + GGML_ASSERT(lctx.is_encoding); + struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_reshape_3d(ctx0, 
Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); + cb(cur, "kqv_out", il); } - cur = inpL; - cb(cur, "result_embd", -1); - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - } else { - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); - - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); - - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); - - struct ggml_tensor * inpCA = cur; - - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); - - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - cur = inpL; - cb(cur, "result_embd", -1); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } + cur = inpL; + cb(cur, "result_embd", -1); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm_enc, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + GGML_ASSERT(!lctx.is_encoding); + GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + + struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + + struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + llm_build_kv_store(ctx0, 
hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); + + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); + + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); + + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "kqv_out", il); + } + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); + + struct ggml_tensor * inpCA = cur; + + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm_cross", il); + + // cross-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); + + kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); + cb(cur, "kqv_out", il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, 
inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + cb(cur, "result_embd", -1); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); return gf; @@ -13898,7 +13980,15 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_T5: { - result = llm.build_t5(); + if (lctx.is_encoding) { + result = llm.build_t5_encoder(); + } else { + result = llm.build_t5_decoder(); + } + } break; + case LLM_ARCH_T5ENCODER: + { + result = llm.build_t5_encoder(); } break; case LLM_ARCH_JAIS: { @@ -14346,7 +14436,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // TODO: use a per-batch flag for logits presence instead const bool has_logits = !cparams.embeddings; - const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)); + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; const size_t embd_size = has_embd ? 
n_embd*n_outputs_max : 0;
 
@@ -14829,9 +14919,24 @@ static int llama_encode_internal(
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
     // the output embeddings after the final encoder normalization
-    struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embd = nullptr;
 
-    GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+    // there are two cases here
+    if (llama_model_has_decoder(&lctx.model)) {
+        // first case is an encoder-decoder T5 model where embeddings are passed to the decoder
+        embd = gf->nodes[gf->n_nodes - 1];
+        GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+    } else {
+        // second case is an encoder-only T5 model
+        if (cparams.embeddings) {
+            // only output embeddings if required
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
+            }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+        }
+    }
 
     ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
@@ -14844,20 +14949,54 @@ static int llama_encode_internal(
         ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
         GGML_ASSERT(backend_embd != nullptr);
 
-        // extract token embeddings
-        GGML_ASSERT(lctx.embd != nullptr);
+        if (llama_model_has_decoder(&lctx.model)) {
+            lctx.embd_enc.resize(n_tokens*n_embd);
+            float * embd_out = lctx.embd_enc.data();
 
-        lctx.embd_enc.resize(n_tokens*n_embd);
-        float * embd_out = lctx.embd_enc.data();
+            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
 
-        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+            // remember the sequence ids used during the encoding - needed for cross attention later
+            lctx.seq_ids_enc.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                for (int s = 0; s < batch.n_seq_id[i]; s++) {
+                    llama_seq_id seq_id = batch.seq_id[i][s];
+                    lctx.seq_ids_enc[i].insert(seq_id);
+                }
+            }
+        } else {
+            GGML_ASSERT(lctx.embd != nullptr);
 
-        // remember the sequence ids used during the encoding - needed for cross attention later
-        lctx.seq_ids_enc.resize(n_tokens);
-        for (uint32_t i = 0; i < n_tokens; i++) {
-            for (int s = 0; s < batch.n_seq_id[i]; s++) {
-                llama_seq_id seq_id = batch.seq_id[i][s];
-                lctx.seq_ids_enc[i].insert(seq_id);
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        GGML_ASSERT(lctx.embd != nullptr);
+                        float * embd_out = lctx.embd;
+
+                        GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+                    } break;
+                case LLAMA_POOLING_TYPE_MEAN:
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_LAST:
+                    {
+                        // extract sequence embeddings
+                        auto & embd_seq_out = lctx.embd_seq;
+                        embd_seq_out.clear();
+
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            const llama_seq_id seq_id = batch.seq_id[i][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        GGML_ABORT("unknown pooling type");
+                    }
             }
         }
     }
 
@@ -16567,6 +16706,8 @@ struct llama_context * llama_new_context_with_model(
     ctx->sampling.rng = std::mt19937(params.seed);
     ctx->logits_all   = params.logits_all;
 
+    // build worst-case graph for the encoder if the model contains an encoder
+    ctx->is_encoding  = llama_model_has_encoder(model);
 
     uint32_t kv_size = cparams.n_ctx;
     ggml_type type_k = params.type_k;
@@ -16881,6 +17022,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
             return LLAMA_ROPE_TYPE_NONE;
 
@@ -17028,8 +17170,16 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
 
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
-        case LLM_ARCH_T5: return true;
-        default: return false;
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
     }
 }
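The encode/decode split introduced above means a caller has to pick the entry point from the model's capabilities. A minimal sketch of that dispatch in plain C, mirroring the logic of the updated embedding example (the helper name `run_batch` is hypothetical; `ctx` and `batch` are assumed to be already initialized):

```c
#include "llama.h"
#include <stdio.h>

// dispatch a batch to the right entry point, mirroring the checks in
// examples/embedding/embedding.cpp above (hedged sketch, not upstream code)
static int run_batch(struct llama_context * ctx, struct llama_batch batch) {
    const struct llama_model * model = llama_get_model(ctx);

    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
        // encoder-only model (e.g. a t5encoder conversion): only llama_encode() is valid
        return llama_encode(ctx, batch);
    }
    if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // decoder-only model: the usual path
        return llama_decode(ctx, batch);
    }
    // full encoder-decoder model: needs a two-phase encode/decode loop,
    // starting the decoder from llama_model_decoder_start_token() (not shown)
    fprintf(stderr, "encoder-decoder models need a two-phase encode/decode loop\n");
    return -1;
}
```

This is why the embedding example above rejects models where both predicates are true: computing embeddings there would require the full two-phase loop.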
From 7eb23840ed0f388e10c4bbc4d65802fdfb977b40 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 10 Aug 2024 13:04:40 +0200
Subject: [PATCH 116/154] llama : default n_swa for phi-3 (#8931)

* default n_swa for phi-3

* fix

* double check swa
---
 src/llama.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 9c4f2aa72..e0fe8013b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4901,7 +4901,6 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
@@ -4910,6 +4909,22 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // for backward compatibility; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {

From 6e02327e8b7837358e0406bf90a4632e18e27846 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 10 Aug 2024 15:42:10 +0200
Subject: [PATCH 117/154] metal : fix uninitialized abort_callback (#8968)

---
 ggml/src/ggml-metal.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9fc08ab3a..aad189430 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -310,7 +310,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context));
+    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
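The one-line metal fix above works because `calloc` zero-initializes the allocation, so optional members such as `abort_callback` start out as NULL instead of holding garbage. A minimal sketch of the failure mode, using a hypothetical `context` struct rather than the real `ggml_backend_metal_context`:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

// hypothetical context with an optional abort callback, for illustration only
struct context {
    bool (*abort_callback)(void * data);
    void * abort_callback_data;
};

static void run(struct context * ctx) {
    // guarded call: safe only if unset callbacks are guaranteed to be NULL
    if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
        fprintf(stderr, "aborted\n");
    }
}

int main(void) {
    // struct context * ctx = malloc(sizeof(struct context)); // fields are indeterminate; the NULL guard is unreliable
    struct context * ctx = calloc(1, sizeof(struct context)); // fields are zeroed; abort_callback == NULL until set
    run(ctx); // no-op, as intended
    free(ctx);
    return 0;
}
```

With `malloc` in place of `calloc`, the guard reads an indeterminate pointer, which is exactly the class of bug the patch removes.

From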
7c5bfd57f83fd3630934cfa70892aa4022d3faf7 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Sun, 11 Aug 2024 10:09:09 +0200 Subject: [PATCH 118/154] Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. (#8943) * Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. - Allocation overhead for the temporary std::vectors was easily detectable with a sampling profiler and simple to remove. - ggml_vk_sync_buffer introduce a full pipeline sync which has a significant cost on the GPU side, sometimes larger than the actual kernel execution. Adding only barriers for shader read/writes and transfers seems to be sufficient looking at the code which either launches compute kernels or copies tensors. * Fix small typo --------- Co-authored-by: 0cc4m --- ggml/src/ggml-vulkan.cpp | 65 +++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index b0f36a513..867328372 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -268,6 +268,10 @@ struct vk_subbuffer { vk_buffer buffer; uint64_t offset; uint64_t size; + + operator vk::DescriptorBufferInfo() const { + return { buffer->buffer, offset, size }; + } }; struct vk_semaphore { @@ -1063,13 +1067,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - const std::vector mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; - ctx->s->buffer.pipelineBarrier( ctx->q->stage_flags, ctx->q->stage_flags, {}, - mem_barriers, + { { + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} + } }, {}, {} ); @@ -2420,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { + + +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; - for (auto& buffer : buffers) { - std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), "; + for (auto& buffer : descriptor_buffer_infos) { + std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - std::vector descriptor_buffer_infos; - std::vector write_descriptor_sets; GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); - GGML_ASSERT(buffers.size() == pipeline->parameter_count); - vk::DescriptorSet& descriptor_set = 
pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); - } - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); - } + GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); - ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); + vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; + ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); @@ -3123,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3312,7 +3312,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } @@ -3384,7 +3384,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, 
(uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3459,7 +3459,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3634,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3834,7 +3836,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); } @@ -4381,7 +4384,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (op == GGML_OP_ROPE) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; @@ -4392,20 +4395,20 @@ static void 
ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } } else { GGML_ASSERT(op != GGML_OP_SOFT_MAX); @@ -4442,10 +4445,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } } } From 33309f661a93c9c0ab65a79e5e7e30fa6162992e Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Sun, 11 Aug 2024 10:35:26 +0200 Subject: [PATCH 119/154] llama : check all graph nodes when searching for result_embd_pooled (#8956) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Stanisław Szymczyk
---
 src/llama.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e0fe8013b..aaf8db496 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -14722,12 +14722,15 @@ static int llama_decode_internal(
         res  = nullptr;
         embd = nullptr;
     } else if (cparams.embeddings) {
-        res = nullptr; // do not extract logits for embedding case
-        embd = gf->nodes[gf->n_nodes - 1];
-        if (strcmp(embd->name, "result_embd_pooled") != 0) {
-            embd = gf->nodes[gf->n_nodes - 2];
+        res  = nullptr; // do not extract logits for embedding case
+        embd = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+                embd = gf->nodes[i];
+                break;
+            }
         }
-        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+        GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
     } else {
         embd = nullptr; // do not extract embeddings when not needed
         GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
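The patch above stops assuming the pooled-embeddings tensor sits at a fixed offset from the end of the graph and instead scans backwards by name. A freestanding sketch of the same reverse search, with a simplified stand-in for the graph's node table (illustrative only, not the actual `ggml_cgraph` layout):

```c
#include <stdio.h>
#include <string.h>

// simplified stand-in for a computation graph: a flat list of named nodes
struct node { const char * name; };

// return the last node with the given name, or NULL if absent --
// the same reverse scan the patch uses for "result_embd_pooled"
static struct node * find_last_node(struct node * nodes, int n_nodes, const char * name) {
    for (int i = n_nodes - 1; i >= 0; --i) {
        if (strcmp(nodes[i].name, name) == 0) {
            return &nodes[i];
        }
    }
    return NULL;
}

int main(void) {
    struct node nodes[] = { {"inp_tokens"}, {"result_embd_pooled"}, {"result_norm"} };
    struct node * embd  = find_last_node(nodes, 3, "result_embd_pooled");
    printf("%s\n", embd ? embd->name : "(missing)");
    return 0;
}
```

The scan costs a few extra string compares per decode but no longer breaks when unrelated nodes are appended after the pooling output.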
List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh ./build/bin/llama-ls-sycl-device ``` + This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` found 2 SYCL devices: @@ -304,12 +321,37 @@ found 2 SYCL devices: | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| ``` +#### Choose level-zero devices -4. Launch inference +|Chosen Device ID|Setting| +|-|-| +|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action| +|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`| +|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| + +#### Execute + +Choose one of following methods to run. + +1. Script + +- Use device 0: + +```sh +./examples/sycl/run_llama2.sh 0 +``` +- Use multiple devices: + +```sh +./examples/sycl/run_llama2.sh +``` + +2. Command line +Launch inference There are two device selection modes: -- Single device: Use one device target specified by the user. +- Single device: Use one device assigned by user. Default device id is 0. - Multiple devices: Automatically choose the devices with the same backend. In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. @@ -326,11 +368,6 @@ Examples: ```sh ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` -or run by script: - -```sh -./examples/sycl/run_llama2.sh 0 -``` - Use multiple devices: @@ -338,12 +375,6 @@ or run by script: ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` -Otherwise, you can run the script: - -```sh -./examples/sycl/run_llama2.sh -``` - *Notes:* - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: @@ -390,7 +421,7 @@ c. Verify installation In the oneAPI command line, run the following to print the available SYCL devices: ``` -sycl-ls +sycl-ls.exe ``` There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: @@ -411,6 +442,18 @@ b. The new Visual Studio will install Ninja as default. (If not, please install ### II. Build llama.cpp +You could download the release package for Windows directly, which including binary files and depended oneAPI dll files. + +Choose one of following methods to build from source code. + +1. Script + +```sh +.\examples\sycl\win-build-sycl.bat +``` + +2. 
CMake + On the oneAPI command line window, step into the llama.cpp main directory and run the following: ``` @@ -425,12 +468,8 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI cmake --build build --config Release -j ``` -Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: -```sh -.\examples\sycl\win-build-sycl.bat -``` - Or, use CMake presets to build: + ```sh cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli @@ -442,7 +481,9 @@ cmake --preset x64-windows-sycl-debug cmake --build build-x64-windows-sycl-debug -j --target llama-cli ``` -Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. +3. Visual Studio + +You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. *Notes:* @@ -450,23 +491,25 @@ Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choos ### III. Run the inference -1. Retrieve and prepare model +#### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. -2. Enable oneAPI running environment +##### Check device + +1. Enable oneAPI running environment On the oneAPI command line window, run the following and step into the llama.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` -3. List devices information +2. List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` -build\bin\ls-sycl-device.exe +build\bin\llama-ls-sycl-device.exe ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -479,9 +522,27 @@ found 2 SYCL devices: | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| ``` +#### Choose level-zero devices +|Chosen Device ID|Setting| +|-|-| +|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action| +|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`| +|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| -4. Launch inference +#### Execute + +Choose one of following methods to run. + +1. Script + +``` +examples\sycl\win-run-llama2.bat +``` + +2. 
### III. Run the inference

-1. Retrieve and prepare model
+#### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.

-2. Enable oneAPI running environment
+##### Check device
+
+1. Enable oneAPI running environment

On the oneAPI command line window, run the following and step into the llama.cpp directory:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

-3. List devices information
+2. List devices information

Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:

```
-build\bin\ls-sycl-device.exe
+build\bin\llama-ls-sycl-device.exe
```

This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:

@@ -479,9 +522,27 @@ found 2 SYCL devices:

| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
```

+#### Choose level-zero devices
+
+|Chosen Device ID|Setting|
+|-|-|
+|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
+|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
+|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|

-4. Launch inference
+#### Execute
+
+Choose one of the following methods to run.
+
+1. Script
+
+```
+examples\sycl\win-run-llama2.bat
+```
+
+2. Command line
+
+Launch inference

There are two device selection modes:

@@ -508,11 +569,7 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
-Otherwise, run the following wrapper script:

-```
-.\examples\sycl\win-run-llama2.bat
-```

Note:

@@ -526,17 +583,18 @@ Or use 1 SYCL GPUs: [0] with Max compute units:512
```

+
## Environment Variable

#### Build

| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
-| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
+| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized models|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
-| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
-| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
+| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
+| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
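As an illustration of how these build variables combine, a typical Linux configure line might look like the following (a sketch, not a mandated invocation; pick the values that match your target):

```sh
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=INTEL -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```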
#### Runtime

@@ -572,9 +630,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512
```

Otherwise, please double-check the GPU driver installation steps.

+- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
+
+  No. We can't support Ollama issues directly, because we aren't familiar with Ollama.
+
+  Please reproduce the problem on llama.cpp itself and report it there as a similar issue; we will support it.
+
+  The same applies to other projects that build on the llama.cpp SYCL backend.
+
+
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issue/PR titles, to help the SYCL team check/address them without delay.

## TODO

-- Support row layer split for multiple card runs.
+- NA

From 8cd1bcfd3fc9f2b5cbafd7fb7581b3278acec25f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 11 Aug 2024 16:58:58 +0300
Subject: [PATCH 121/154] flake.lock: Update (#8979)

---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index c54af88ea..f9e1548a2 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1722421184,
-        "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
+        "lastModified": 1723175592,
+        "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
+        "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b",
         "type": "github"
       },
       "original": {

From 4134999e01f31256b15342b41c4de9e2477c4a6c Mon Sep 17 00:00:00 2001
From: compilade
Date: Sun, 11 Aug 2024 14:45:41 -0400
Subject: [PATCH 122/154] gguf-py : Numpy dequantization for most types (#8939)

* gguf-py : Numpy dequantization for most types

* gguf-py : Numpy dequantization for grid-based i-quants
---
 gguf-py/gguf/quants.py       | 981 ++++++++++++++++++++++++++++++++++-
 gguf-py/tests/test_quants.py | 237 +++++++++
 2 files changed, 1215 insertions(+), 3 deletions(-)
 create mode 100755 gguf-py/tests/test_quants.py
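For orientation, here is a minimal usage sketch of the module this patch extends, assuming a local `gguf-py` checkout is importable; `gguf.quants.quantize`/`gguf.quants.dequantize` and `GGMLQuantizationType` are the entry points touched below, while the tensor shape is an arbitrary example:

```python
import numpy as np
import gguf
from gguf.constants import GGMLQuantizationType

# Round-trip a float32 tensor through Q8_0 (block size 32):
# quantize with the Numpy implementation, then dequantize it back
# and inspect the quantization error.
data = np.random.randn(4, 256).astype(np.float32)
packed = gguf.quants.quantize(data, GGMLQuantizationType.Q8_0)
restored = gguf.quants.dequantize(packed, GGMLQuantizationType.Q8_0)
print(float(np.max(np.abs(restored - data))))  # small but non-zero error expected
```

diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index a443dd27e..ff589b852 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -1,10 +1,11 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Callable, Sequence
+from math import log2, ceil

from numpy.typing import DTypeLike

-from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
from .lazy import LazyNumpyTensor

import numpy as np
@@ -64,8 +65,10 @@ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:

def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
-    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
-        return data.astype(np.float32, copy=False)
+    if qtype == GGMLQuantizationType.F32:
+        return data.view(np.float32)
+    elif qtype ==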
GGMLQuantizationType.F16: + return data.view(np.float16).astype(np.float32) elif (q := _type_traits.get(qtype)) is not None: return q.dequantize(data) else: @@ -77,6 +80,11 @@ class __Quant(ABC): block_size: int type_size: int + grid: np.ndarray[Any, np.dtype[np.float32]] | None = None + grid_shape: tuple[int, int] = (0, 0) + grid_map: tuple[int | float, ...] = () + grid_hex: bytes | None = None + def __init__(self): return TypeError("Quant conversion classes can't have instances") @@ -94,6 +102,27 @@ class __Quant(ABC): assert qtype not in _type_traits _type_traits[qtype] = cls + @classmethod + def init_grid(cls): + if cls.grid is not None or cls.grid_hex is None: + return + + bits_per_elem = ceil(log2(len(cls.grid_map))) + assert bits_per_elem != 0, cls.qtype.name + elems_per_byte = 8 // bits_per_elem + + grid = np.frombuffer(cls.grid_hex, dtype=np.uint8) + # decode hexadecimal chars from grid + grid = grid.reshape((-1, 2)) + grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2)) + grid = grid[..., 0] | grid[..., 1] + # unpack the grid values + grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte)) + grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1)) + grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1)) + grid = np.take_along_axis(grid_map, grid, axis=-1) + cls.grid = grid.reshape((1, 1, *cls.grid_shape)) + @classmethod @abstractmethod def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: @@ -140,6 +169,7 @@ class __Quant(ABC): @classmethod def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: + cls.init_grid() return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape)) @classmethod @@ -187,6 +217,166 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16): return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32) +class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -8 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q4_0's reference rounding is cursed and depends on FMA + qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8) + + return (d * qs.astype(np.float32)) + + +class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 15 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np.trunc((blocks - min) * id + 
np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, qs = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32) + + return (d * qs) + m + + +class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -16 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q5_0's reference rounding is cursed and depends on FMA + q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16) + + return (d * qs.astype(np.float32)) + + +class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 31 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, rest = np.hsplit(rest, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> 
np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * qs) + m + + class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): @classmethod # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c @@ -211,3 +401,788 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): x = x.view(np.int8).astype(np.float32) return (x * d) + + +class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + scales, rest = np.hsplit(blocks, [QK_K // 16]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + d, dmin = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + + shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3) + + qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32) + + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + hmask, rest = np.hsplit(blocks, [QK_K // 8]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [12]) + + d = d.view(np.float16).astype(np.float32) + + # The scales are packed at 6-bit each in this pattern: + # 0: IIIIAAAA + # 1: JJJJBBBB + # 2: KKKKCCCC + # 3: LLLLDDDD + # 4: MMMMEEEE + # 5: NNNNFFFF + # 6: OOOOGGGG + # 7: PPPPHHHH + # 8: MMIIEEAA + # 9: NNJJFFBB + # 10: OOKKGGCC + # 11: PPLLHHDD + lscales, hscales = np.hsplit(scales, [8]) + lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1)) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4)) + scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32) + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3) + qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1)) + qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1 + q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32) + + return (dl * q).reshape((n_blocks, QK_K)) + + +class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): + K_SCALE_SIZE = 12 + + @staticmethod + def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + n_blocks = scales.shape[0] + scales = scales.view(np.uint8) + ### Unpacking the following: ### + # 0 EEAAAAAA + # 1 FFBBBBBB + # 2 GGCCCCCC + # 3 HHDDDDDD + # 4 eeaaaaaa + # 5 ffbbbbbb + 
# 6 ggcccccc + # 7 hhdddddd + # 8 eeeeEEEE + # 9 ffffFFFF + # 10 ggggGGGG + # 11 hhhhHHHH + scales = scales.reshape((n_blocks, 3, 4)) + d, m, m_d = np.split(scales, 3, axis=-2) + + sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1) + min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1) + + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32) + + return (d * qs - dm).reshape((n_blocks, QK_K)) + + +class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE]) + qh, qs = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + ql, rest = np.hsplit(blocks, [QK_K // 2]) + qh, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [QK_K // 16]) + + scales = scales.view(np.int8).astype(np.float32) + d = d.view(np.float16).astype(np.float32) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32) + q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32) + + return (d * q).reshape((n_blocks, QK_K)) + + +class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): + ksigns: bytes = ( + b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" + b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f" + b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf" + b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f" + b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf" + 
b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f" + b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f" + b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff" + ) + + # iq2xxs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (256, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a00110014002000220028002a0041004400500058006100" + b"6400800082008a00a20001010401100115014001840198010002020222028202" + b"010404041004210424044004420448046004810484049004a404000502050805" + b"200546056905800591050906100640068406a406000805080808140828084108" + b"440850085208880804094009020a140a01100410101021104010601084109010" + b"951000110811201150115a118011241245120014081420142514491480141815" + b"6215001616160118041810184018811800190519a019511a002002200a204420" + b"6120802082202921482100220222012404241024402456240025412564259026" + b"082820289428442a014004401040184021402440404048405640604081408440" + b"9040004120416141804185410142104248425642684200440844204480449944" + b"124524450046014804481048404845480049584961498249454a904a00500850" + b"1150195020508050885004514251a4519152905492540a550156545600581158" + b"195864584059085a046010604060686000615561186260620064056410651265" + b"84654268008002800a8041808280048118814081118201840484108415844084" + b"608400854685948509864086608602880489118a0490109024904090a1901691" + b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.view(np.uint32).reshape(n_blocks, -1, 2) + + db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) + ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS): + # iq2xs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. 
+ grid_shape = (512, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"49005000520055005800610064008000820085008800910094009900a0000101" + b"04010601090110011201150118011a0121012401400142014501480151015401" + b"6001680181018401900100020202050208021102140220024102440250025502" + b"80028a0201040404060409041004120415041804210424044004420445044804" + b"5104540456046004810484049004000502050505080511051405200541054405" + b"500561058005010604061006260640064206840600080208050808080a081108" + b"14082008250841084408500858088008a008aa08010904091009400981098909" + b"000a200a280a960aa00a01100410061009101010121015101810211024104010" + b"4210451048105110541060106a10811084109010001102110511081111111411" + b"2011411144115011801194119611011204120612101240126012001402140514" + b"0814111414142014411444144914501464148014011504151015401500161416" + b"49160118041810181218401854188618001905196619511aa91a002002200520" + b"08200a201120142020204120442050208020a020012104211021402148216521" + b"002222228022a82201240424102429244024002541255225992501261a26a626" + b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440" + b"0640094010401240154018402140244040404240454048404a40514054406040" + b"6540814084409040004102410541084111411441204141414441504180418541" + b"a241014204421042124229424042004402440544084411441444194420444144" + b"4444504480449444014504451045244540459a4500460a464446504601480448" + b"1048404845485448624800491149444950496949044a00500250055008501150" + b"145020502850415044505050805001510451105115514051425100524452aa52" + b"0154045410542154405460548154a154005508558055885521566856a1560058" + b"14584158505899581a5940594259855a0160046010604060546062608660a960" + b"006124624a62926200641664106540654565a46501686a682569066a546a626a" + b"00800280058008801180148020802a8041804480508080808280a880aa800181" + b"0481068110814081518159810082208280828282a082a8820184048410841284" + b"158440846084898400854485a58518866a860088088825885a8880888288a888" + b"0689228a808a888a968aa88a0190049010904090569084900091229164915692" + b"89920094059444945094589429959095929541965198a6984999159a609a00a0" + b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4" + b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [2 * QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qs = qs.view(np.uint16) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128) + signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_S(__Quant, 
qtype=GGMLQuantizationType.IQ2_S): + # iq2s_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (1024, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"490050005200550058006100640066006900800082008500880091009400a000" + b"a500aa0001010401060109011001120115011801210124014001420145014801" + b"510154015601590160016501680181018401900192019501a101a40100020202" + b"050208021102140220022a02410244024602490250025502800285028a029402" + b"a202010404040604090410041204150418042104240426042904400442044504" + b"48044a0451045404560459046004620465048104840486048904900495049804" + b"a104a40400050205050508050a05110514051605190520052505280541054405" + b"46054905500552055505580561056405800582058505880591059405a0050106" + b"0406060609061006150640064506480651065406600681068406900600080208" + b"050808081108140816081908200825082a084108440846084908500852085508" + b"580861086408800885089408aa08010904091009120915091809210940094509" + b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410" + b"0610091010101210151018102110241026104010421045104810511054105610" + b"59106010621065106810811084108610901095109810a110a410001102110511" + b"08110a1111111411161119112011221125112811411144114611491150115211" + b"5511581161116411801182118511881191119411011204120912101215122112" + b"2412401245125112541281128412901200140214051408141114141416141914" + b"2014251428144114441446144914501452145514581461146414801482148514" + b"881491149414a014011504150615091510151215151518152115241540154215" + b"4515481551155415601581158415901500160516081611161416201641164416" + b"50168016aa160118041806180918101815181818211840184218451848185118" + b"541860188118841800190219051908191119141920194119441950196919a219" + b"041a101a401a561a00200220052008201120142016201920202025202a204120" + b"4420502052205520642080208a209420aa200121042110211221152121214021" + b"4221452151215421602181218421902100220a22222228222a22442250228822" + b"8a22a82201240424062409241024152418242124242440244224452448245124" + b"5424602481248424902400250525082511251425202541254425502566258025" + b"0126042610264026592600280528112814284128442850288a28aa2801290429" + b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40" + b"21402440264040404240454048404a4051405440564059406040624065408140" + b"8440904095409840a140a4400041024105410841114114411641194120412241" + b"2541414144414641494150415241554158416141644180418241854188419141" + b"9441a04101420442104212421542184224424042454248425142544260428142" + b"844200440244054408440a441144144416441944204422442544284441444444" + b"46444944504452445544584461446444804482448544884491449444a0440145" + b"0445064509451045124515451845214524454045424545454845514554456045" + b"6a4581458445904500460246054608461146144620464146444650468046a546" + b"0148044809481048124815481848214824484048424845484848514854486048" + b"84489048004902490549084911491449204941494449504980499649014a044a" + b"104a404a00500250055008501150145016501950205022502550285041504450" + b"4650495050505250555058506150645080508250855088509150945001510451" + b"0651095110511251155118512151245140514251455148515151545160518151" + b"8451905100520552085211521452205241524452505269528052015404540654" + b"0954105412541554185421542454405442544554485451545454605481548454" + b"9054005502550555085511551455205541554455505580550156045610562656" + b"405600580258055808581158145820584158445850585a588058015904591059" + 
b"4059005a195a855aa85a01600460066010601260156018602160246040604560" + b"4860516054606060846090600061026105610861116114612061416144615061" + b"806199610462106240625662a162006405640864116414642064416444645064" + b"806401650465106540654a656865926500669466016804681068656898680069" + b"2a69426aa16a0080028005800880118014801980208025804180448050805280" + b"5580588061808080858091809480018104810981108112811581188121812481" + b"408142814581488151815481818184819081a981008205820a82118214824182" + b"4482508201840484068409841084128415841884218440844284458448845184" + b"5484608481848484908400850285058508851185148520854185448550858085" + b"8a85018604861086298640860088058811881488418844885088a28801890489" + b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090" + b"4290459048905190549060908190849090900091059111911491419144915091" + b"5a910192049210924092a6920094029405940894119414942094419444945094" + b"8094969401950495109540959895a19500964696649601980498109826984098" + b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0" + b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4" + b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 8]) + signs, rest = np.hsplit(rest, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 32]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS): + grid_shape = (256, 4) + grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e) + grid_hex = ( + b"0000020004001100130017002000220031004200730075000101030110011201" + b"2101250130013201410154017001000202020402110220022202310233023702" + b"5102570275020103070310031203250370031304370444045704730475040105" + b"0705320552053506640610071407160743076107011003101010121021102310" + b"3010321034104710501000110211111120112211011203121012121221123012" + b"7212001302132013311346136613011405145014201524154615711505162217" + b"4017002002201120132020202220262031204220012103210521102112212121" + b"3021632167217021002202221122172220222222372240225522012310231423" + b"7023742335245324032527254125742501270327162745270130103012302130" + b"2330503065307230003102312031313144314631013203321032253252327232" + b"1133333330344734723400350635223555351436363663363337603704401740" + b"3540374053405740744120423742404260426642074345430444514464442545" + b"4345704505471047124730471250415070500051065126515551145232527252" + 
b"0253535310542354275472540255315550562457425724604460466064602161" + b"6161176264623063366344640565526533660367216703700570077010703270" + b"5270267140711272457252720073157333736073217441740075027524753076" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + scales = scales.view(np.uint32) + + db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) + ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S): + grid_shape = (512, 4) + grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f) + grid_hex = ( + b"0000010002000500070010001100120014001600200021002500330040004200" + b"4500470051005300600062007100740077000001010102010401100111011501" + b"2001230127013101350144016101650172010002010205020702100213021602" + b"2102250230023402420245024702510253027002730203031103150320032203" + b"3103330336034403500352036703710375030004130417042104240432044004" + b"4304510470040205040520052205260533054105450547056605730506061106" + b"1306310652067106000702070407200722072607330750075407001001100210" + b"0410101011101310151017102010221031103410361054105610611072100011" + b"0111031106111011141121113011331141115011521170117611001212121512" + b"1712201224123212401243125512601272120113041307131013131321132713" + b"3013341341136213701303140514121414143114331442144614501454140115" + b"1015131521153015321551152016241627164416461601170317101712172117" + b"3517411762177017002001200320052007201020122014201620212023202720" + b"3020322041204320452050205220672070207320752000210221102113211721" + b"2221252131213421422151210122042207222122232230223722412253225722" + b"7122742200230223052311232223242331233323422350236623012407242024" + b"2324322435244124722475240425112522253725402553257025002602260726" + b"2126552661260527112726273027432750270230113013301530173022303130" + b"3330353042304430473051306330713001310331053114312131233140316031" + b"7231763100321232203232323432503201331033143321332333273330334133" + b"4333473355337333033411341634223431345234603464340135103512352535" + b"3235443556357335163641360137033720372237353700400440124020402440" + b"2740324041405040704002410741114113412241304135414341514155410142" + b"0342104215422142334240425742624270420443114313432043224331433543" + b"0044024424443744404471440545074521456245134634466046104715473047" + b"4347514702501050145022504050445047505250665074500151035105511251" + b"2151325172510052115223523052365253520253075310532753445351536553" + 
b"7353015404542054325446541255265551555355425602570457225711601360" + b"1560316033606060006120612761646112623462426255626262706200631463" + b"2163406325644364626400650365346560650566406611671367007004700770" + b"2070227036704070547062700271117124714371457101720472107216722172" + b"3072517202733273357353730174057413742074507422754275027631760077" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + qh, rest = np.hsplit(rest, [QK_K // 32]) + signs, scales = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (1 + 2 * scales) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8) + qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1)) + qs = qs.astype(np.uint16) | (qh << 8) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S): + # iq1s_grid, with each byte packed into 2 bits + # -1, 0, 1 <=> 0, 1, 2 + grid_shape = (2048, 8) + grid_map = (-1, 0, 1) + grid_hex = ( + b"00000200050008000a00110015002000220028002a0045005100540056006500" + b"8000820088008a009500a000a200a800aa000401050111011401160119011a01" + b"2501410146014901520155015a0161016401660168018501910194019601a501" + b"0002020208020a0215022002220228022a024502510259026402690280028202" + b"88028a02910295029902a002a202a802aa021104140416042504410449045504" + b"5a046404650491049904a5040105040505050605150518051a05290540054505" + b"4a0550055105540555055605590560056205650568056a058105910595059805" + b"9a05a105a405a505a605a9051406190641064406500652065506580660066106" + b"6606690685069106940699060008020808080a0815082008220828082a084508" + b"5108560865088008820888088a089508a008a208a808aa080509110914091909" + b"2409250941095009510955096109640969099109940996099909a509000a020a" + b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a" + b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510" + b"58106110641065106910911094109610a110a510011104110611091110111211" + b"1511181121112411291145114a11501151115211541155115611591160116511" + b"841192119511a111a41111121412161225124012461249125212551258125a12" + b"641266128512911294129612a512011406140914141415141814191421142614" + b"41144514461448144a1451145414551456145914621465146814841489149014" + b"94149514981499149a14a114a414a514a914021505150a151115141515151615" + b"191520152215251528152a154115441545154615511552155415551556155915" + b"5a1561156415651566156915801582158415851588158a159015911594159515" + b"961599159a15a015a215a51501160416051606161516161618161a1621162616" + b"401642164416451648164a165116551656165816591661166416651668166916" + b"6a1686168a1692169516a416a916111816182518411844184618491850185518" + b"58185a1860186118641866186918851891189418a5181019121915191a192119" + 
b"25194219441945194819511954195519561959195a19601965196a1989199119" + b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a" + b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520" + b"28202a20452051205920612065208020822088208a209520a020a220a520a820" + b"aa2005211121142119212521422144214921552158215a216121642165216621" + b"8521902196219921a521012208220a22112215222022222228222a2245225122" + b"562259226522812288228a2291229522a022a222a822aa220524142416241924" + b"252444244524462449245224552458245a2466248524912494249924a124a524" + b"0925152521252925402545254825512554255525592562256525682589259025" + b"9425952598259a25a125a425a625a92505261026122619262526412649265526" + b"6026612669268426862690269a260028022808280a2815282028222828282a28" + b"45285128542865288028822888288a28a028a228a828aa280929112914291929" + b"2529462949295229552961296429662969298529902996299929a429a529002a" + b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a" + b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440" + b"664094409940a140a6400041014104410641094112411541164118411a412141" + b"26412941454148414a41514154415541564159415a41654168416a4181418441" + b"8641904192419541a041a141a241054211421442164225424142524255425a42" + b"6442694289429442a5420144154419442944454448444a445144544455445644" + b"61446244654468446a44814486448944904492449544a044a144a94401450245" + b"05450a4511451445154516451945204525452a45414544454545464549455045" + b"5145544555455645584559456145644565456645694582458445854588459145" + b"94459545964599459a45a545a845aa450146054609461446154618461a462146" + b"2446294640464246454648465046514652465546564659466246654668468146" + b"85468a4694469546a146a446a6460548114815481a4825484248494850485548" + b"5848614864486648694885489148944896489948a5480149054906490a491049" + b"144915491849214924492649404945494a495149524954495549564959496049" + b"6249654966496a49864989499249954996499849a149a449a649a949164a444a" + b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550" + b"1a50215024502950405045504850515054505550565059506550685086508950" + b"95509850a050a150a650a9500551085109510a51115114511551165118511951" + b"20512551265128512a5141514451455146514951505151515251545155515651" + b"585159515a51615164516551665169518251855191519451955196519951a051" + b"a551aa5101520652125215521a5221522452425245524a525152545255525652" + b"595262526552855290529252955299529a52a452045405541154145415541654" + b"185419542154255428542a54415444544554465449544a545054515454545554" + b"5654585459545a54615462546454655466546954805488548a54915494549554" + b"96549954a154a454a554aa540155025504550555065509551055115512551455" + b"1555165519551a55215524552555265529554055415542554455455546554855" + b"4955505551555255545555555655585559555a55605561556455655566556855" + b"69556a5581558455855589558a559055915594559555965598559955a155a455" + b"a555a655a9550056015602560456065608560956115614561556185619562056" + b"2156225624562556265628562956415645564656485649564a56505651565256" + b"545655565656585659565a566156645665566956825685568656885689568a56" + b"915695569a56a256a556a656a856a95604580558065809581058155818582158" + b"2a58455848584a58515854585558565858585958605862586458655882588958" + b"9058925895589858a158a9580159025905590a59115914591559165919592559" + b"41594459455946594959505951595259545955595659585959595a5961596459" + b"655966596959815985598959915994599559965998599959a559045a085a155a" + b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a" + 
b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060" + b"5560566058605a60616064606660696081609660a56001610461066109611261" + b"15612161226126612961456149615161556156615961656166616a6184618a61" + b"92619561a161a661a96111621662196240624162466255625662586260628562" + b"91629662a56211641264156416641a6421642664296440644264456448644a64" + b"516454645564566459645a646064626465648464856489649064926494649564" + b"966498649a64a164a464a964056508650a651165156516651965446545654665" + b"496550655165546555655665596561656465656566656965866589658a659165" + b"9565966599659a65a265a565a665a86502660966156620662666286629664066" + b"456648664a66516654665566566658665a666066656668668066826685668a66" + b"9466966698669966a066a466a666aa661668196825684168526855685a686168" + b"6968856891689868a66801690469106915692169246926692969406941694569" + b"4669486951695469556956695969606965696a69826984698a699569a169a469" + b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a" + b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480" + b"5680598065808080828088808a809580a080a280a880aa800581118114811681" + b"1981258141814481498150815281558156815881598164816681698185818981" + b"948196819981a5810082028208820a8215822082228228822a82518254825982" + b"65828082828288828a829582a082a282a882aa82148419844184448451845584" + b"5a846184648469849484998401850985128515851a8526852985408541854585" + b"4885518554855585568559855a856585668568856a8581858485868589859085" + b"928595859885a68511861686198625864186448649864a865086558659865a86" + b"618666866a86858691869a86a4860088028808880a8815882088228828882a88" + b"41884588518854885988658869888088828888888a889588a088a288a888aa88" + b"05890689118914891689258941894489468949895089528955895a8961896489" + b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a" + b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590" + b"419046904990559058905a9069906a9085909190949096909990a59001910491" + b"069109911091159118911a912191249126912991409145915091519154915591" + b"569159916291659184918691929195919891a191a491a691a991059211921492" + b"19922592449246924992509252925592589266926992859294929692a9920194" + b"04940694109415941894269440944a9451945494559456945894599460946194" + b"62946594849486949294949495949894a194a9940095059508950a9510951195" + b"14951595169519952195259529952a9541954495459546954995509551955295" + b"549555955695589559955a956195649565956695699581958595889591959295" + b"94959595969599959a95a095a295a595a895aa95019604961096159619962096" + b"2696299645964896499651965296559656965996659668968296849689968a96" + b"929694969596a496a696a9960598169819982598419846985098529855985698" + b"5a98649865988598919896989998a59804990699099910991299159918991a99" + b"209921992499269940994299459948994a995199549955995699599962996599" + b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a" + b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0" + b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0" + b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1" + b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2" + b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4" + b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5" + b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5" + b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6" + b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8" + 
b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9" + b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa" + b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa" + ) + + delta = np.float32(0.125) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, qh = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint16) + + dl = d * (2 * ((qh >> 12) & 7) + 1) + dl = dl.reshape((n_blocks, -1, 1, 1)) + delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M): + grid_shape = IQ1_S.grid_shape + grid_map = IQ1_S.grid_map + grid_hex = IQ1_S.grid_hex + + delta = IQ1_S.delta + + # Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts. + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 16]) + + # The f16 scale is packed across multiple bytes + scales = scales.view(np.uint16) + d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4)) + d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3] + d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1)) + + scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) + scales = (scales & 0x07).reshape((n_blocks, -1)) + dl = d * (2 * scales + 1) + dl = dl.reshape((n_blocks, -1, 2, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1)) + + delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 2, 2, 1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 2, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL): + kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1)) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1)) + + return (d * qs) + + +class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + scales_h, rest = 
np.hsplit(rest, [2]) + scales_l, qs = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + scales_h = scales_h.view(np.uint16) + + scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1)) + scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F) + scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03) + + scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32) + dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F) + + kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1)) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32)) + + return (dl * qs).reshape((n_blocks, -1)) diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py new file mode 100755 index 000000000..8b7a85c2c --- /dev/null +++ b/gguf-py/tests/test_quants.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 + +# Test gguf.quants so that it exactly matches the C implementation of the (de)quantization + +# NOTE: this is kind of a mess, but at least it worked for initially testing the Python implementations. + +from __future__ import annotations + +import argparse +from math import prod +import os +import sys +from pathlib import Path +import ctypes +import logging +import numpy as np + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +import gguf +from gguf.constants import GGMLQuantizationType + + +logger = logging.getLogger("test-quants") + + +c_float_p = ctypes.POINTER(ctypes.c_float) + + +class ggml_init_params(ctypes.Structure): + _fields_ = [ + ("mem_size", ctypes.c_size_t), + ("mem_buffer", ctypes.c_void_p), + ("no_alloc", ctypes.c_bool), + ] + + +class GGMLQuants: + libggml: ctypes.CDLL + + def __init__(self, libggml: Path): + self.libggml = ctypes.CDLL(str(libggml)) + self.libggml.ggml_quantize_chunk.restype = ctypes.c_size_t + # enum ggml_type type, + # const float * src, + # void * dst, + # int64_t start, + # int64_t nrows, + # int64_t n_per_row, + # const float * imatrix) { + self.libggml.ggml_quantize_chunk.argtypes = ( + ctypes.c_int, + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.POINTER(ctypes.c_float), + ) + + self.libggml.ggml_quantize_requires_imatrix.restype = ctypes.c_bool + self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,) + + for t in ( + "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", + "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", + "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m", + "iq4_nl", "iq4_xs", + ): + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + t) + dequant_func.restype = None + dequant_func.argtypes = (ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + + self.libggml.ggml_fp16_to_fp32_row.restype = None + self.libggml.ggml_fp16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + self.libggml.ggml_bf16_to_fp32_row.restype = None + 
self.libggml.ggml_bf16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + + self.libggml.ggml_init.argtypes = (ggml_init_params,) + + self.libggml.ggml_init(ggml_init_params(1 * 1024 * 1024, 0, False)) + + def dequantize(self, tensor: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + result = np.zeros(gguf.quant_shape_from_byte_shape(tensor.shape, qtype), dtype=np.float32, order="C") + if qtype == GGMLQuantizationType.F32: + # no-op + result = tensor.view(np.float32) + elif qtype == GGMLQuantizationType.F16: + self.libggml.ggml_fp16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) + elif qtype == GGMLQuantizationType.BF16: + self.libggml.ggml_bf16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) + else: + lw_qname = qtype.name.lower() + if lw_qname[-1] == "k": + lw_qname = lw_qname[:-1] + "K" + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + lw_qname) + dequant_func(tensor.ctypes.data_as(ctypes.c_void_p), result.ctypes.data_as(c_float_p), result.size) + return result + + def quantize(self, data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + result = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C") + if self.libggml.ggml_quantize_requires_imatrix(qtype.value): + # TODO: is a column-wise sum of squares appropriate? + qw = np.sum((data * data).reshape((-1, data.shape[-1])), axis=0).ctypes.data_as(c_float_p) + else: + qw = ctypes.cast(0, c_float_p) + result_size = self.libggml.ggml_quantize_chunk(qtype.value, data.ctypes.data_as(c_float_p), result.ctypes.data_as(ctypes.c_void_p), 0, prod(data.shape[:-1]), data.shape[-1], qw) + assert result.size == result_size + return result + + +def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) -> bool: + same = np.array_equal(t1, t2) + if same: + return True + else: + block_size, type_size = gguf.GGML_QUANT_SIZES[qtype] + if t1.dtype == np.float32: + t1 = t1.reshape((-1, block_size)) + t2 = t2.reshape((-1, block_size)) + else: + t1 = t1.reshape((-1, type_size)) + t2 = t2.reshape((-1, type_size)) + x = t1.view(np.uint8) ^ t2.view(np.uint8) + diff_bits = np.count_nonzero(np.unpackbits(x, axis=-1), axis=-1) + num_bad_blocks = np.count_nonzero(diff_bits, axis=0) + if num_bad_blocks == 0 and t1.shape == t2.shape: + logger.debug("Bits are equal, but arrays don't match, likely contains NANs") + return True + logger.debug(f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)") + bad_block_id = np.argmax(diff_bits, axis=0) + logger.debug(f"Worst block id: {bad_block_id}") + logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") + + sum_diff_bits = np.sum(diff_bits) + logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)") + return False + + +def do_test(libggml_path: Path, quick: bool = False): + ggml_quants = GGMLQuants(libggml_path) + + np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n}) + + r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False) + + for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()): + has_dequantize = False + has_quantize = False + + try: + gguf.dequantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype) + 
has_dequantize = True + except (NotImplementedError, AssertionError) as e: + if isinstance(e, AssertionError): + logger.error(f"Error with {qtype.name}: {e}") + raise e + try: + gguf.quantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype) + has_quantize = True + except (NotImplementedError, AssertionError) as e: + if isinstance(e, AssertionError): + logger.error(f"Error with {qtype.name}: {e}") + raise e + + if not has_dequantize and not has_quantize: + continue + + logger.info(f"Testing {qtype.name}") + + rc = r.copy(order="C") + + pyq = None + ggq = None + + if has_quantize: + logger.debug(f"Quantizing to {qtype.name} with Python") + pyq = gguf.quants.quantize(rc, qtype) + + logger.debug(f"Quantizing to {qtype.name} with C") + ggq = ggml_quants.quantize(rc, qtype) + + if qtype == GGMLQuantizationType.F16: + pyq = pyq.view(np.uint8) + quant_equal = compare_tensors(pyq, ggq, qtype) + + if not quant_equal: + logger.error(f"Quantization to {qtype.name} does not match ❌") + else: + logger.info(f"Quantization to {qtype.name} matches exactly ✅") + + if has_dequantize: + if ggq is None and not quick: + logger.debug(f"Quantizing to {qtype.name} with C") + ggq = ggml_quants.quantize(rc, qtype) + + if ggq is not None: + logger.debug(f"Dequantizing from {qtype.name} with Python") + pydq = gguf.quants.dequantize(ggq, qtype) + logger.debug(f"Dequantizing from {qtype.name} with C") + ggdq = ggml_quants.dequantize(ggq, qtype) + + dequant_equal = compare_tensors(pydq, ggdq, qtype) + + if not dequant_equal: + logger.error(f"Dequantization from {qtype.name} does not match ❌") + else: + logger.info(f"Dequantization from {qtype.name} matches exactly ✅") + + rq_shape = gguf.quants.quant_shape_to_byte_shape((8, 1024, 1024 // 2), qtype) + rq = np.random.random(rq_shape).astype(np.float16).view(np.uint8) + + logger.debug(f"Dequantizing random f16 data as {qtype.name} with Python") + pydq = gguf.quants.dequantize(rq, qtype) + logger.debug(f"Dequantizing random f16 data as {qtype.name} with C") + ggdq = ggml_quants.dequantize(rq, qtype) + + dequant_equal = compare_tensors(pydq, ggdq, qtype) + + if not dequant_equal: + logger.error(f"Dequantization from random f16 data as {qtype.name} does not match ❌") + else: + logger.info(f"Dequantization from random f16 data as {qtype.name} matches exactly ✅") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation") + parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so") + parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary") + + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG) + + do_test(args.libggml, args.quick) From 5ef07e25ac39e62297a67208c5bcced50835a2dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 12 Aug 2024 10:21:50 +0300 Subject: [PATCH 123/154] server : handle models with missing EOS token (#8997) ggml-ci --- examples/server/server.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 360f571e4..1621c7c43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -631,6 +631,7 @@ struct server_context { bool clean_kv_cache = true; bool add_bos_token = true; + bool has_eos_token = false; int32_t n_ctx; // total context for all clients / slots @@ -693,7 
+694,7 @@ struct server_context { n_ctx = llama_n_ctx(ctx); add_bos_token = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); + has_eos_token = llama_add_eos_token(model) != 1; return true; } @@ -1031,7 +1032,7 @@ struct server_context { { slot.sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) { + if (json_value(data, "ignore_eos", false) && has_eos_token) { slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } From d3ae0ee8d75033921a076131d4d0fa1c6ec579a7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 12 Aug 2024 11:02:01 +0300 Subject: [PATCH 124/154] py : fix requirements check '==' -> '~=' (#8982) * py : fix requirements check '==' -> '~=' * cont : fix the fix * ci : run on all requirements.txt --- .github/workflows/python-check-requirements.yml | 6 ++---- examples/llava/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 4e0374fc6..46e80aecd 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -6,15 +6,13 @@ on: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' + - '**/requirements*.txt' pull_request: paths: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' + - '**/requirements*.txt' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index dfe5fbe62..cbcbf26c9 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -2,4 +2,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu pillow~=10.2.0 torch~=2.2.1 -torchvision==0.17.1 +torchvision~=0.17.1 From 2589292cde038ba876c041bcd7b3f0c81f3f11fe Mon Sep 17 00:00:00 2001 From: Liu Jia Date: Mon, 12 Aug 2024 17:46:03 +0800 Subject: [PATCH 125/154] Fix a spelling mistake (#9001) --- src/llama-sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8910f6d65..8f4841d9d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra constexpr float bucket_low = -10.0f; constexpr float bucket_high = 10.0f; constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); - constexpr float bucker_inter = -bucket_low * bucket_scale; + constexpr float bucket_inter = -bucket_low * bucket_scale; std::vector bucket_idx(candidates->size); std::vector histo(nbuckets, 0); for (int i = 0; i < (int)candidates->size; ++i) { const float val = candidates->data[i].logit; - int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); ib = std::max(0, std::min(nbuckets-1, ib)); bucket_idx[i] = ib; ++histo[ib]; From df5478fbea7e652cfad4ee7974ac3b624fd6c7f6 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Mon, 12 Aug 2024 13:21:41 +0100 Subject: [PATCH 126/154] ggml: fix div-by-zero (#9003) Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=70724 In order to access the above bug you 
need to login using one of the emails in https://github.com/google/oss-fuzz/blob/master/projects/llamacpp/project.yaml#L3-L5 Signed-off-by: David Korczynski --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 38990e3a0..c9b0e8168 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21129,7 +21129,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[2] * (int64_t) info->ne[3]; - if (ne % ggml_blck_size(info->type) != 0) { + if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); From 1262e7ed13ac197c944f15e1ddb083cb4f36cf65 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Mon, 12 Aug 2024 13:36:41 +0100 Subject: [PATCH 127/154] grammar-parser : fix possible null-deref (#9004) Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=70680 Signed-off-by: David Korczynski --- common/grammar-parser.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index a518b766d..438452eab 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -369,6 +369,9 @@ namespace grammar_parser { } // Validate the state to ensure that all rules are defined for (const auto & rule : state.rules) { + if (rule.empty()) { + throw std::runtime_error("Undefined rule"); + } for (const auto & elem : rule) { if (elem.type == LLAMA_GRETYPE_RULE_REF) { // Ensure that the rule at that location exists From 84eb2f4fad28ceadd415a4e775320c983f4d9a7d Mon Sep 17 00:00:00 2001 From: Frank Mai Date: Mon, 12 Aug 2024 20:45:50 +0800 Subject: [PATCH 128/154] docs: introduce gpustack and gguf-parser (#8873) * readme: introduce gpustack GPUStack is an open-source GPU cluster manager for running large language models, which uses llama.cpp as the backend. Signed-off-by: thxCode * readme: introduce gguf-parser GGUF Parser is a tool to review/check the GGUF file and estimate the memory usage without downloading the whole model. Signed-off-by: thxCode --------- Signed-off-by: thxCode --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1283f6805..7f48fde6e 100644 --- a/README.md +++ b/README.md @@ -186,10 +186,12 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption +- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage **Infrastructure:** - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs **Games:** - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. 
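
Patches 126 and 127 above apply the same defensive pattern to two fuzzer-found crashes: validate a derived value before it is used as a divisor or iterated over. The following is a minimal standalone sketch of that pattern; the names (`block_size_of`, `validate_tensor`, `validate_rules`, `rule`) are hypothetical stand-ins for illustration, not the actual ggml or grammar-parser symbols.

```cpp
// Sketch only: hypothetical names, not the actual ggml / grammar-parser code.
#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical stand-in for a block-size lookup: a corrupted type id read
// from an untrusted file can map to a block size of 0.
static int64_t block_size_of(int type) {
    static const int64_t sizes[] = { 1, 32, 256 };
    return (type >= 0 && type < 3) ? sizes[type] : 0;
}

static void validate_tensor(int type, int64_t ne) {
    const int64_t bs = block_size_of(type);
    // The bs == 0 test must come first: short-circuit evaluation keeps
    // ne % bs from ever executing with a zero divisor.
    if (bs == 0 || ne % bs != 0) {
        throw std::runtime_error("invalid tensor type or element count");
    }
}

using rule = std::vector<int>; // hypothetical: a rule is a list of elements

static void validate_rules(const std::vector<rule> & rules) {
    for (const auto & r : rules) {
        // Reject empty (undefined) rules before any caller indexes into them.
        if (r.empty()) {
            throw std::runtime_error("undefined rule");
        }
    }
}
```

The ordering inside the first check is the point: `ne % bs` with `bs == 0` is undefined behavior in C and C++, so the zero test has to sit on the left of the `||`, exactly as in the `gguf_init_from_file` fix above.
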
From 0fd93cdef5e583aa980b3c0d693c0d207f0787a7 Mon Sep 17 00:00:00 2001 From: Nico Bosshard Date: Mon, 12 Aug 2024 17:13:59 +0200 Subject: [PATCH 129/154] llama : model-based max number of graph nodes calculation (#8970) * llama : model-based max number of graph nodes calculation * Update src/llama.cpp --------- Co-authored-by: slaren --- src/llama.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index aaf8db496..7f2f00031 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3575,13 +3575,8 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map; -// TODO: update when needed or think of some clever automatic way to do this -static size_t llama_model_max_nodes(const llama_model & /*model*/) { - //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B - // return 32768; - //} - - return 8192; +static size_t llama_model_max_nodes(const llama_model & model) { + return std::max(8192, model.tensors_by_name.size()*5); } struct llama_model_loader { From 1f67436c5ee6f4c99e71a8518bdfc214c27ce934 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 12 Aug 2024 19:17:03 +0300 Subject: [PATCH 130/154] ci : enable RPC in all of the released builds (#9006) ref: #8912 --- .github/workflows/build.yml | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b9246659a..74b5d4f69 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -222,7 +222,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build . 
--config Release -j $(nproc) - name: Test @@ -696,22 +696,20 @@ jobs: strategy: matrix: include: - - build: 'rpc-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' From fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2 Mon Sep 17 00:00:00 2001 From: Diogo Teles Sant'Anna Date: Mon, 12 Aug 2024 13:28:23 -0300 Subject: [PATCH 131/154] ci : fix github workflow vulnerable to script injection (#9008) Signed-off-by: Diogo Teles Sant'Anna --- .github/workflows/bench.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index eb69b82c4..56d22bc0c 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -129,6 +129,8 @@ jobs: - name: Server bench id: server_bench + env: + HEAD_REF: ${{ github.head_ref || github.ref_name }} run: | set -eux @@ -137,7 +139,7 @@ jobs: python bench.py \ --runner-label ${{ env.RUNNER_LABEL }} \ --name ${{ github.job }} \ - --branch ${{ github.head_ref || github.ref_name }} \ + --branch $HEAD_REF \ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ --scenario script.js \ --duration ${{ github.event.inputs.duration || env.DURATION }} \ From 828d6ff7d796f48b2c345f6be2805a3c531a089c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 13 Aug 2024 11:41:14 
+0200 Subject: [PATCH 132/154] export-lora : throw error if lora is quantized (#9002) --- examples/export-lora/README.md | 6 ++--- examples/export-lora/export-lora.cpp | 35 +++++++++++++++++++++------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 91c33c34a..7dce99c9a 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -17,9 +17,9 @@ For example: ```bash ./bin/llama-export-lora \ - -m open-llama-3b-v2-q8_0.gguf \ - -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf + -m open-llama-3b-v2.gguf \ + -o open-llama-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf ``` Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 3176d6e26..c7e5ca788 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -10,6 +10,12 @@ static bool g_verbose = false; +struct tensor_transformation { + struct ggml_tensor * in; + struct ggml_tensor * out; + bool is_copy; +}; + static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); @@ -198,8 +204,7 @@ struct lora_merge_ctx { } // mapping base tensor to out tensor (same shape with base, but different type) - // if out_tensor == nullptr, we only copy it - std::vector> base_to_out_tensors; + std::vector trans; for (auto & it : base_model.tensors) { bool t_a = true; bool t_b = true; @@ -212,14 +217,22 @@ struct lora_merge_ctx { // only copy struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); ggml_set_name(cpy_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + trans.push_back({ + cpy_tensor, + cpy_tensor, + true, + }); gguf_add_tensor(ctx_out, cpy_tensor); } else if (t_a && t_b) { // need merging struct ggml_tensor * out_tensor = ggml_new_tensor( ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); ggml_set_name(out_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + trans.push_back({ + base_tensor, + out_tensor, + false, + }); gguf_add_tensor(ctx_out, out_tensor); } else { throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); @@ -234,12 +247,12 @@ struct lora_merge_ctx { // process base model tensors size_t n_merged = 0; - for (auto & it : base_to_out_tensors) { - if (it.second != nullptr) { - merge_tensor(it.first, it.second); + for (auto & it : trans) { + if (!it.is_copy) { + merge_tensor(it.in, it.out); n_merged++; } else { - copy_tensor(it.first); + copy_tensor(it.in); } } @@ -252,7 +265,7 @@ struct lora_merge_ctx { } printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); + printf("%s : wrote %ld tensors to output file\n", __func__, trans.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -285,6 +298,10 @@ struct lora_merge_ctx { for (size_t i = 0; i < adapters.size(); ++i) { auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_b = adapters[i]->get_tensor(name_lora_b); + // TODO: add 
support for quantized lora + if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) { + throw std::runtime_error("quantized LoRA adapters are not supported, please retry with f16 or f32"); + } inp_a[i] = ggml_dup_tensor(ctx, t_a); inp_b[i] = ggml_dup_tensor(ctx, t_b); } From 06943a69f678fb32829ff06d9c18367b17d4b361 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 13 Aug 2024 21:13:15 +0200 Subject: [PATCH 133/154] ggml : move rope type enum to ggml.h (#8949) * ggml : move rope type enum to ggml.h This commit moves the `llama_rope_type` enum from `llama.h` to `ggml.h` and changes its name to `ggml_rope_type`. The motivation for this change is to address the TODO in `llama.h` and use the enum in ggml. Note: This commit does not change the `mode` parameter to be of type `enum ggml_rope_type`. The name `mode` and its usage suggest that it might be more generic and possibly used as a bit field for multiple flags. Further investigation/discussion may be needed to determine if `mode` should be restricted to RoPE types. * squash! ggml : move rope type enum to ggml.h This commit removes GGML_ROPE_TYPE_NONE and GGML_ROPE_TYPE_GLM from ggml.h, and moves them back to the llama_rope_type enum. I've kept the assert for GGML_ROPE_TYPE_GLM as I'm not sure if it is safe to remove it yet. * squash! ggml : move rope type enum to ggml.h This commit removes the enum ggml_rope_type from ggml.h and replaces it with a define (GGML_ROPE_TYPE_NEOX). This define is used in the code to check if the mode is set to GPT-NeoX. Also the enum llama_rope_type has been updated to reflect this change. * squash! ggml : move rope type enum to ggml.h This commit contains a suggestion to enable the GGML_ROPE_TYPE_NEOX macro/define to be passed to the shader compiler. * squash! ggml : move rope type enum to ggml.h This commit fixes the editorconfig-checker warnings. * squash! ggml : move rope type enum to ggml.h Update comment for ggml_rope function. * Revert "squash! ggml : move rope type enum to ggml.h" This reverts commit 6261222bd0dc0efd51f0fb0435ad3f16a5b52fd6. * squash! ggml : move rope type enum to ggml.h Add GGML_ROPE_TYPE_NEOX to rope_common.comp. 
* remove extra line --------- Co-authored-by: slaren --- ggml/include/ggml.h | 6 ++++-- ggml/src/ggml-cann/aclnn_ops.cpp | 2 +- ggml/src/ggml-cuda/rope.cu | 2 +- ggml/src/ggml-metal.m | 2 +- ggml/src/ggml-sycl/rope.cpp | 2 +- ggml/src/ggml-vulkan.cpp | 2 +- ggml/src/ggml.c | 4 ++-- ggml/src/kompute-shaders/op_rope_f16.comp | 2 +- ggml/src/kompute-shaders/op_rope_f32.comp | 2 +- ggml/src/kompute-shaders/rope_common.comp | 2 ++ include/llama.h | 7 ++----- 11 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 15602a96d..1d2a35402 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -244,6 +244,8 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGML_ROPE_TYPE_NEOX 2 + #define GGUF_MAGIC "GGUF" #define GGUF_VERSION 3 @@ -1453,8 +1455,8 @@ extern "C" { struct ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) - // if mode & 2 == 1, GPT-NeoX style + // if (mode & 1) - skip n_past elements (NOT SUPPORTED) + // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions GGML_API struct ggml_tensor * ggml_rope( diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 8c4132f5b..a4ec8418e 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2881,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; // init cos/sin cache ggml_cann_pool_alloc sin_allocator( diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 99ec1dd98..88f586d68 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -226,7 +226,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const int32_t * pos = (const int32_t *) src1_d; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index aad189430..995f1934b 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -2313,7 +2313,7 @@ static enum ggml_status ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; id pipeline = nil; diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index c7545bcc1..1f06f78fa 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -226,7 +226,7 @@ void ggml_sycl_op_rope( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const int32_t * pos = (const int32_t *) src1_dd; diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 867328372..c0504e434 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -4053,7 +4053,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_ROPE: { const int mode = ((const int32_t *) 
dst->op_params)[2]; - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c9b0e8168..88e4fb732 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14094,7 +14094,7 @@ static void ggml_compute_forward_rope_f32( float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { @@ -14219,7 +14219,7 @@ static void ggml_compute_forward_rope_f16( float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { diff --git a/ggml/src/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp index 1a4058b3f..0ecfb2eab 100644 --- a/ggml/src/kompute-shaders/op_rope_f16.comp +++ b/ggml/src/kompute-shaders/op_rope_f16.comp @@ -11,7 +11,7 @@ void main() { const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; - const bool is_neox = (pcs.mode & 2) != 0; + const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; float corr_dims[2]; rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); diff --git a/ggml/src/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp index 65e03827a..cec0fd9a5 100644 --- a/ggml/src/kompute-shaders/op_rope_f32.comp +++ b/ggml/src/kompute-shaders/op_rope_f32.comp @@ -11,7 +11,7 @@ void main() { const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; - const bool is_neox = (pcs.mode & 2) != 0; + const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; float corr_dims[2]; rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); diff --git a/ggml/src/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp index 7b9394cb2..df4702896 100644 --- a/ggml/src/kompute-shaders/rope_common.comp +++ b/ggml/src/kompute-shaders/rope_common.comp @@ -1,5 +1,7 @@ #include "common.comp" +#define GGML_ROPE_TYPE_NEOX 2 + // TODO: use a local size of 32 or more (Metal uses 1024) layout(local_size_x = 1) in; diff --git a/include/llama.h b/include/llama.h index ce07f4fac..3c28cf0b5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -95,13 +95,10 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, }; - // note: these values should be synchronized with ggml_rope - // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE_NORM = 0, - LLAMA_ROPE_TYPE_NEOX = 2, - LLAMA_ROPE_TYPE_GLM = 4, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, }; enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file From 43bdd3ce188cd247bda7c1bd1ad01fa64e566690 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 14 Aug 2024 09:14:49 +0300 Subject: [PATCH 134/154] cmake : remove unused option GGML_CURL (#9011) --- ggml/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 7fe1661bb..357e7e51e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -129,7 +129,6 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to 
use CUDA VMM" option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) -option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) From 98a532d474c73d3494a5353024cb6a4fbbabbb35 Mon Sep 17 00:00:00 2001 From: compilade Date: Wed, 14 Aug 2024 02:51:02 -0400 Subject: [PATCH 135/154] server : fix segfault on long system prompt (#8987) * server : fix segfault on long system prompt * server : fix parallel generation with very small batch sizes * server : fix typo in comment --- examples/server/server.cpp | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1621c7c43..c25338f57 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -754,13 +754,13 @@ struct server_context { default_generation_settings_for_props = get_formated_generation(slots.front()); default_generation_settings_for_props["seed"] = -1; - // the update_slots() logic will always submit a maximum of n_batch tokens + // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(n_batch, 0, 1); + batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1); } metrics.init(); @@ -1137,28 +1137,19 @@ struct server_context { if (!system_prompt.empty()) { system_tokens = ::llama_tokenize(ctx, system_prompt, true); - llama_batch_clear(batch); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, { 0 }, false); - } - const int32_t n_batch = llama_n_batch(ctx); + const int32_t n_tokens_prompt = system_tokens.size(); - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; + for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - if (llama_decode(ctx, batch_view) != 0) { + llama_batch_clear(batch); + + for (int32_t j = 0; j < n_tokens; ++j) { + llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); + } + + if (llama_decode(ctx, batch) != 0) { LOG_ERROR("llama_decode() failed", {}); return; } From 5fd89a70ead34d1a17015ddecad05aaa2490ca46 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 14 Aug 2024 18:32:53 +0200 Subject: [PATCH 136/154] Vulkan Optimizations and Fixes (#8959) * Optimize Vulkan REPEAT performance * Use Vulkan GLSL fused multiply-add instruction where possible * Add GGML_VULKAN_PERF option to output performance data per operator * Rework and fix Vulkan descriptor set and descriptor pool handling * Fix float32 concat f16 shader validation error * Add Vulkan GROUP_NORM eps parameter * Fix validation error with transfer queue memory barrier flags * Remove trailing whitespaces --- Makefile | 4 + ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 4 + ggml/src/ggml-vulkan.cpp | 1388 ++++++++--------- 
ggml/src/vulkan-shaders/concat.comp | 6 +- ggml/src/vulkan-shaders/mul_mat_vec.comp | 3 +- ggml/src/vulkan-shaders/mul_mat_vec_nc.comp | 2 +- ggml/src/vulkan-shaders/mul_mat_vec_p021.comp | 2 +- ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp | 35 +- ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp | 19 +- ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp | 43 +- ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp | 56 +- ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp | 26 +- ggml/src/vulkan-shaders/mul_mm.comp | 15 +- ggml/src/vulkan-shaders/repeat.comp | 24 + .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 4 + 16 files changed, 781 insertions(+), 851 deletions(-) create mode 100644 ggml/src/vulkan-shaders/repeat.comp diff --git a/Makefile b/Makefile index 649671ed6..332496cfc 100644 --- a/Makefile +++ b/Makefile @@ -763,6 +763,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG endif +ifdef GGML_VULKAN_PERF + MK_CPPFLAGS += -DGGML_VULKAN_PERF +endif + ifdef GGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE endif diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 357e7e51e..cc1685884 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -135,6 +135,7 @@ option(GGML_VULKAN "ggml: use Vulkan" option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) +option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 425a25895..1775ef3cc 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -602,6 +602,10 @@ if (GGML_VULKAN) add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) endif() + if (GGML_VULKAN_PERF) + add_compile_definitions(GGML_VULKAN_PERF) + endif() + if (GGML_VULKAN_VALIDATE) add_compile_definitions(GGML_VULKAN_VALIDATE) endif() diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index c0504e434..7a0ec706f 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1,6 +1,6 @@ #include "ggml-vulkan.h" #include -#ifdef GGML_VULKAN_RUN_TESTS +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) #include #endif @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -34,9 +35,7 @@ #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de -#define VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN 0 -#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1 -#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2 +#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32 #define GGML_VK_MAX_NODES 8192 @@ -74,6 +73,8 @@ struct vk_queue { std::vector cmd_buffers; vk::PipelineStageFlags stage_flags; + + bool transfer_only; }; struct vk_pipeline_struct { @@ -133,6 +134,9 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { #ifdef GGML_VULKAN_MEMORY_DEBUG class vk_memory_logger; #endif +#ifdef GGML_VULKAN_PERF +class vk_perf_logger; +#endif static void ggml_vk_destroy_buffer(vk_buffer& buf); struct vk_device_struct { @@ -148,7 +152,6 @@ struct vk_device_struct { vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; - uint32_t descriptor_set_mode; uint32_t subgroup_size; bool uma; @@ -186,6 +189,7 @@ struct vk_device_struct { vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_clamp_f32; vk_pipeline 
pipeline_pad_f32; + vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; @@ -205,7 +209,8 @@ struct vk_device_struct { vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; - std::vector pipelines; + std::unordered_map pipelines; + std::unordered_map pipeline_descriptor_set_requirements; std::vector> pinned_memory; @@ -217,6 +222,9 @@ struct vk_device_struct { #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; #endif +#ifdef GGML_VULKAN_PERF + std::unique_ptr perf_logger; +#endif ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -231,11 +239,11 @@ struct vk_device_struct { } for (auto& pipeline : pipelines) { - if (pipeline.expired()) { + if (pipeline.second.expired()) { continue; } - vk_pipeline pl = pipeline.lock(); + vk_pipeline pl = pipeline.second.lock(); ggml_vk_destroy_pipeline(device, pl); } pipelines.clear(); @@ -479,6 +487,48 @@ private: #define VK_LOG_MEMORY(msg) ((void) 0) #endif // GGML_VULKAN_MEMORY_DEBUG +#if defined(GGML_VULKAN_PERF) + +class vk_perf_logger { +public: + void print_timings() { + std::cerr << "----------------\nVulkan Timings:" << std::endl; + for (const auto& t : timings) { + uint64_t total = 0; + for (const auto& time : t.second) { + total += time; + } + std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl; + } + + timings.clear(); + } + + void log_timing(const ggml_tensor * node, uint64_t time) { + if (node->op == GGML_OP_UNARY) { + timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time); + return; + } + if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { + const uint64_t m = node->src[0]->ne[1]; + const uint64_t n = node->src[1]->ne[1]; + const uint64_t k = node->src[1]->ne[0]; + std::string name = ggml_op_name(node->op); + if (n == 1) { + name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k); + } else { + name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); + } + timings[name].push_back(time); + return; + } + timings[ggml_op_name(node->op)].push_back(time); + } +private: + std::map> timings; +}; +#endif // GGML_VULKAN_PERF + struct ggml_backend_vk_context { std::string name; @@ -489,9 +539,6 @@ struct ggml_backend_vk_context { size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k; vk_buffer prealloc_x, prealloc_y, prealloc_split_k; vk::Fence fence; - vk_buffer staging; - size_t staging_size; - size_t staging_offset; vk_buffer buffer_pool[MAX_VK_BUFFERS]; @@ -595,35 +642,9 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co descriptor_set_layout_create_info.setPNext(&dslbfci); pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); - // Check if device supports multiple descriptors per pool - if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { - const uint32_t alloc_count = 2; - - // Try allocating multiple sets from one pool - // This fails on AMD for some reason, so add a fall back to allocating one pool per set - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size); - vk::DescriptorPool pool = device->device.createDescriptorPool(descriptor_pool_create_info); - 
- std::vector layouts(alloc_count); - for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline->dsl; - } - try { - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data()); - std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - } catch(vk::OutOfPoolMemoryError const&) { - device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; - } - - device->device.destroyDescriptorPool(pool); - } - - if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - } + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); + pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); pipeline->descriptor_set_idx = 0; @@ -657,7 +678,7 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co pipeline->layout); pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - device->pipelines.push_back(pipeline); + device->pipelines.insert({ pipeline->name, pipeline }); } static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { @@ -678,34 +699,49 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) device.destroyPipeline(pipeline->pipeline); } -static void ggml_pipeline_allocate_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) { - VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")"); - if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { - // Enough descriptors are available - return; - } +static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) { + VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); + device->pipeline_descriptor_set_requirements[pipeline->name] += n; +} +static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) { std::lock_guard guard(device->mutex); - if (device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { - const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); + for (auto& pair : device->pipeline_descriptor_set_requirements) { + vk_pipeline pipeline = device->pipelines.at(pair.first).lock(); + const uint64_t n = pair.second; - std::vector layouts(alloc_count); - for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline->dsl; + VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")"); + + if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { + // Enough descriptors are available + continue; } - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data()); - std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); - } else { - 
for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl); + uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); + uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; + + while (to_alloc > 0) { + const uint32_t alloc_count = std::min(pool_remaining, to_alloc); + to_alloc -= alloc_count; + pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; + + if (pool_idx >= pipeline->descriptor_pools.size()) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); + pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); + } + + std::vector layouts(alloc_count); + for (uint32_t i = 0; i < alloc_count; i++) { + layouts[i] = pipeline->dsl; + } + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data()); std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline->descriptor_sets.push_back(sets[0]); + pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); + + pool_idx++; } } } @@ -866,11 +902,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector guard(device->mutex); q.queue_family_index = queue_family_index; + q.transfer_only = transfer_only; vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index); q.pool = device->device.createCommandPool(command_pool_create_info_compute); @@ -1067,13 +1104,16 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); + + const bool transfer_queue = ctx->q->transfer_only; + ctx->s->buffer.pipelineBarrier( ctx->q->stage_flags, ctx->q->stage_flags, {}, { { - {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, - {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} + { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, + { !transfer_queue ? 
(vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) } } }, {}, {} @@ -1664,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -1703,6 +1745,9 @@ static vk_device ggml_vk_get_device(size_t idx) { #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger = std::unique_ptr(new vk_memory_logger()); #endif +#ifdef GGML_VULKAN_PERF + device->perf_logger = std::unique_ptr(new vk_perf_logger()); +#endif size_t dev_num = vk_instance.device_indices[idx]; @@ -1833,17 +1878,15 @@ static vk_device ggml_vk_get_device(size_t idx) { device_create_info.setPNext(&device_features2); device->device = device->physical_device.createDevice(device_create_info); - device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; - // Queues - ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); + ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); // Shaders ggml_vk_load_shaders(device); if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; - ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); + ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); } else { // TODO: Use pointer or reference to avoid copy device->transfer_queue = device->compute_queue; @@ -2130,9 +2173,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->fence = ctx->device->device.createFence({}); - ctx->staging_size = 0; - ctx->staging_offset = 0; - #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 
0 : atoi(skip_checks)); @@ -2565,23 +2605,15 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont return; } - // Staging buffer required - vk_buffer staging = ctx->staging; - size_t staging_offset = ctx->staging_offset; - const size_t copy_size = ts*ne/bs; - if (ctx->staging->size < ctx->staging_offset + copy_size) { - if (sync_staging) { - // Create temporary larger buffer - ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); - - staging = ctx->device->sync_staging; - staging_offset = 0; - } else { - GGML_ABORT("fatal error"); - } + if (!sync_staging) { + GGML_ABORT("Asynchronous write to non-pinned memory not supported"); } - VkBufferCopy buf_copy{ staging_offset, offset, copy_size }; + // Staging buffer required + vk_buffer& staging = ctx->device->sync_staging; + const uint64_t copy_size = ts*ne/bs; + ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); + VkBufferCopy buf_copy{ 0, offset, copy_size }; ggml_vk_sync_buffers(subctx); vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); @@ -2590,14 +2622,14 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont for (uint64_t i2 = 0; i2 < ne2; i2++) { // Find longest contiguous slice if (ne1*nb1 == dstnb2) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); } else { for (uint64_t i1 = 0; i1 < ne1; i1++) { if (ne0*nb0/bs == dstnb1) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); } else { const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; - const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; + const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); } @@ -2608,7 +2640,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } } -static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { @@ -2643,21 +2675,18 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz } VK_LOG_DEBUG("STAGING"); - // Staging buffer required - const size_t copy_size = width*height; - if (staging_buffer == nullptr || staging_buffer->size < staging_offset + copy_size) { - if (sync_staging) { - ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size); - - 
staging_buffer = dst->device->sync_staging; - staging_offset = 0; - } else { - GGML_ABORT("fatal error"); - } + if (!sync_staging) { + GGML_ABORT("Asynchronous write to non-pinned memory not supported"); } + // Staging buffer required + const size_t copy_size = width*height; + ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size); + + vk_buffer& staging_buffer = dst->device->sync_staging; + VkBufferCopy buf_copy = { - staging_offset, + 0, offset, copy_size}; @@ -2665,17 +2694,17 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy); if (width == spitch) { - deferred_memcpy((uint8_t *)staging_buffer->ptr + staging_offset, src, width * height, &subctx->in_memcpys); + deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys); } else { for (size_t i = 0; i < height; i++) { - deferred_memcpy((uint8_t *)staging_buffer->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); + deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); } } } -static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); - return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging); + return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging); } static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { @@ -2690,7 +2719,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * } else { vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); ggml_vk_ctx_begin(dst->device, subctx); - ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true); + ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true); ggml_vk_ctx_end(subctx); for (auto& cpy : subctx->in_memcpys) { @@ -2708,7 +2737,7 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); } -static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); @@ -2745,18 +2774,15 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size } VK_LOG_DEBUG("STAGING"); + if (!sync_staging) { + GGML_ABORT("Asynchronous read from non-pinned memory not supported"); + } + // Fall back to staging buffer const size_t copy_size = dpitch * height; - if (staging_buffer == 
nullptr || staging_buffer->size < staging_offset + copy_size) { - if (sync_staging) { - // Create temporary larger buffer - ggml_vk_ensure_sync_staging_buffer(src->device, copy_size); + ggml_vk_ensure_sync_staging_buffer(src->device, copy_size); - staging_buffer = src->device->sync_staging; - } else { - GGML_ABORT("fatal error"); - } - } + vk_buffer& staging_buffer = src->device->sync_staging; ggml_vk_sync_buffers(subctx); subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices); @@ -2764,8 +2790,8 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys); } -static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { - return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging); +static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { + return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging); } static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { @@ -2777,7 +2803,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ } else { vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); ggml_vk_ctx_begin(src->device, subctx); - ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true); + ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); @@ -2978,10 +3004,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 }); } -static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << 
dst->nb[3]; + std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -3055,6 +3082,56 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + } else { + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + + if (dryrun) { + const uint64_t x_sz_upd = x_sz * ne02 * ne03; + const uint64_t y_sz_upd = y_sz * ne12 * ne13; + const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0; + if ( + (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || + (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) || + (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { + ctx->prealloc_size_x = x_sz_upd; + } + if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) { + ctx->prealloc_size_y = y_sz_upd; + } + if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; + } + + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + } + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1); + } + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3090,34 +3167,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); - } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - - // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1); - if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1); - } - if (split_k > 1) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, 
ctx->device->pipeline_matmul_split_k_reduce, 1); - } - if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); } else if (qx_needs_dequant) { @@ -3151,10 +3200,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub ); // NOLINT } -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -3218,6 +3268,47 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t y_sz = f16_f32_kernel ? 
sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type); + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + GGML_ASSERT(dmmv != nullptr); + + if (dryrun) { + const uint64_t x_sz_upd = x_sz * ne02 * ne03; + const uint64_t y_sz_upd = y_sz * ne12 * ne13; + if ( + (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || + (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { + ctx->prealloc_size_x = x_sz_upd; + } + if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) { + ctx->prealloc_size_y = y_sz_upd; + } + + // Request descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + } + ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3250,30 +3341,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type); - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - GGML_ASSERT(dmmv != nullptr); - - // Allocate descriptor sets - if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 
1 : ne12 * ne13); - } - ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13); - if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); @@ -3316,10 +3383,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT @@ -3360,6 +3428,12 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t d_sz = sizeof(float) * d_ne; + if (dryrun) { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1); + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3372,9 +3446,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c GGML_ASSERT(d_Qx != nullptr); } - // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; @@ -3387,10 +3458,11 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); @@ -3435,6 +3507,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t qy_sz = ggml_nbytes(src1); const uint64_t d_sz = sizeof(float) * d_ne; + if (dryrun) { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3447,9 +3525,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(d_Qx != nullptr); } - // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; @@ -3463,20 +3538,20 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst); + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun); } else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst); + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun); } else { - ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst); + ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun); } } -static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << 
src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -3566,6 +3641,48 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t ids_sz = nbi2; const uint64_t d_sz = sizeof(float) * d_ne; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + } else { + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + + if (dryrun) { + const uint64_t x_sz_upd = x_sz * ne02 * ne03; + const uint64_t y_sz_upd = y_sz * ne12 * ne13; + if ( + (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || + (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { + ctx->prealloc_size_x = x_sz_upd; + } + if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) { + ctx->prealloc_size_y = y_sz_upd; + } + + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + } + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3605,31 +3722,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); - } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - - // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1); - if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, 1); - } - if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); } else if (qx_needs_dequant) { @@ -3664,11 +3756,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& ); // NOLINT } 
-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); @@ -3742,6 +3835,47 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t ids_sz = nbi2; const uint64_t d_sz = sizeof(float) * d_ne; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type); + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + GGML_ASSERT(dmmv != nullptr); + + if (dryrun) { + const uint64_t x_sz_upd = x_sz * ne02 * ne03; + const uint64_t y_sz_upd = y_sz * ne12 * ne13; + if ( + (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || + (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { + ctx->prealloc_size_x = x_sz_upd; + } + if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) { + ctx->prealloc_size_y = y_sz_upd; + } + + // Request descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + } + ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + return; + } + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); @@ -3779,30 +3913,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline to_fp16_vk_0 = nullptr; - vk_pipeline to_fp16_vk_1 = nullptr; - if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); - } - if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); - } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); - } - vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type); - GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT - GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT - GGML_ASSERT(dmmv != nullptr); - - // Allocate descriptor sets - if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_0, 1); - } - if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, to_fp16_vk_1, y_non_contig ? 
1 : ne12 * ne13); - } - ggml_pipeline_allocate_descriptor_sets(ctx->device, dmmv, ne12 * ne13); - if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); @@ -3841,85 +3951,15 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); } -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst); + ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } else { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } } -static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")"); - const uint64_t ne0 = dst->ne[0]; - const uint64_t ne1 = dst->ne[1]; - const uint64_t ne2 = dst->ne[2]; - const uint64_t ne3 = dst->ne[3]; - - const uint64_t ne00 = src0->ne[0]; - const uint64_t ne01 = src0->ne[1]; - const uint64_t ne02 = src0->ne[2]; - const uint64_t ne03 = src0->ne[3]; - - const uint64_t nb0 = dst->nb[0]; - const uint64_t nb1 = dst->nb[1]; - const uint64_t nb2 = dst->nb[2]; - const uint64_t nb3 = dst->nb[3]; - - const uint64_t nb00 = src0->nb[0]; - const uint64_t nb01 = src0->nb[1]; - const uint64_t nb02 = src0->nb[2]; - const uint64_t nb03 = src0->nb[3]; - - // guaranteed to be an integer due to the check in ggml_can_repeat - const uint64_t nr0 = ne0/ne00; - const uint64_t nr1 = ne1/ne01; - const uint64_t nr2 = ne2/ne02; - const uint64_t nr3 = ne3/ne03; - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - - const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); - const uint64_t src_offset = extra_src0->offset + src0->view_offs; - vk_buffer dst_buf = extra->buffer_gpu.lock(); - const uint64_t dst_offset = extra->offset + dst->view_offs; - - std::vector copies; - - for (uint64_t i3 = 0; i3 < nr3; i3++) { - for (uint64_t k3 = 0; k3 < ne03; k3++) { - for (uint64_t i2 = 0; i2 < nr2; i2++) { - for (uint64_t k2 = 0; k2 < ne02; k2++) { - for (uint64_t i1 = 0; i1 < nr1; i1++) { - for (uint64_t k1 = 0; k1 < ne01; k1++) { - for (uint64_t i0 = 0; i0 < nr0; i0++) { - copies.push_back({ - src_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, - dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, - ne00*nb0, - }); - } - } - } - } - } - } - } 
- - ggml_vk_sync_buffers(subctx); - subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies); - - GGML_UNUSED(ctx); - GGML_UNUSED(src1); -} - - static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: @@ -3985,6 +4025,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_pad_f32; } return nullptr; + case GGML_OP_REPEAT: + if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { + return ctx->device->pipeline_repeat_f32; + } + return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -4107,15 +4152,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const GGML_UNUSED(src2); } -static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { - switch(op) { - case GGML_OP_REPEAT: - return ggml_vk_op_repeat; - default: - return nullptr; - } -} - static bool ggml_vk_op_supports_incontiguous(ggml_op op) { switch (op) { case GGML_OP_CPY: @@ -4129,6 +4165,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_SQR: case GGML_OP_CLAMP: case GGML_OP_PAD: + case GGML_OP_REPEAT: return true; default: return false; @@ -4136,7 +4173,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4144,7 +4181,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co if (src2 != nullptr) { std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; } - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")"); + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + 
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(dst->extra != nullptr); @@ -4176,20 +4214,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint64_t ned = ned0 * ned1; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); - ggml_vk_func_t op_func; if (pipeline == nullptr) { - op_func = ggml_vk_op_get_func(op); - if (op_func == nullptr) { - std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); - if (src1 != nullptr) { - std::cerr << " and " << ggml_type_name(src1->type); - } - std::cerr << " to " << ggml_type_name(dst->type) << std::endl; - GGML_ABORT("fatal error"); + std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); + if (src1 != nullptr) { + std::cerr << " and " << ggml_type_name(src1->type); } + std::cerr << " to " << ggml_type_name(dst->type) << std::endl; + GGML_ABORT("fatal error"); + } - op_func(ctx, subctx, src0, src1, dst); + if (dryrun) { + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); return; } @@ -4278,188 +4314,141 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co std::array elements; // Single call if dimension 2 is contiguous - if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1); + GGML_ASSERT(op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))); - switch (op) { - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_SOFT_MAX: - case GGML_OP_SUM_ROWS: - { - const uint32_t nr = ggml_nrows(src0); - if (nr > 262144) { - elements = { 512, 512, CEIL_DIV(nr, 262144) }; - } else if (nr > 512) { - elements = { 512, CEIL_DIV(nr, 512), 1 }; - } else { - elements = { nr, 1, 1 }; - } - } break; - case GGML_OP_GROUP_NORM: - { - const uint32_t num_groups = dst->op_params[0]; - elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_ROPE: - elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; - break; - case GGML_OP_GET_ROWS: - elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; - break; - case GGML_OP_ARGSORT: - elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 }; - break; - case GGML_OP_IM2COL: - { - const bool is_2D = dst->op_params[6] == 1; - - const uint32_t IC = src1->ne[is_2D ? 2 : 1]; - - const uint32_t KH = is_2D ? src0->ne[1] : 1; - const uint32_t KW = src0->ne[0]; - - const uint32_t OH = is_2D ? 
dst->ne[2] : 1; - const uint32_t OW = dst->ne[1]; - - const uint32_t batch = src1->ne[3]; - - elements = { OW * KW * KH, OH, batch * IC }; - } break; - case GGML_OP_TIMESTEP_EMBEDDING: - { - const uint32_t dim = dst->op_params[0]; - uint32_t half_ceil = (dim + 1) / 2; - elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; - } break; - case GGML_OP_ADD: - case GGML_OP_DIV: - case GGML_OP_MUL: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_CLAMP: - case GGML_OP_PAD: - case GGML_OP_CPY: - case GGML_OP_CONCAT: - case GGML_OP_UPSCALE: - case GGML_OP_UNARY: - { - const uint32_t ne = ggml_nelements(dst); - if (ne > 262144) { - elements = { 512, 512, CEIL_DIV(ne, 262144) }; - } else if (ne > 512) { - elements = { 512, CEIL_DIV(ne, 512), 1 }; - } else { - elements = { ne, 1, 1 }; - } - } break; - default: - elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; - break; - } - - if (!op_supports_incontiguous) { - if (x_sz != VK_WHOLE_SIZE) { - x_sz *= ne02 * ne03; - } - if (use_src1 && y_sz != VK_WHOLE_SIZE) { - y_sz *= ne12 * ne13; - } - if (use_src2 && z_sz != VK_WHOLE_SIZE) { - z_sz *= ne22 * ne23; - } - if (d_sz != VK_WHOLE_SIZE) { - d_sz *= ned2 * ned3; - } - } - - if (op == GGML_OP_SOFT_MAX) { - // Empty src1 is possible in soft_max, but the shader needs a buffer - vk_subbuffer subbuf_y; - if (use_src1) { - subbuf_y = { d_Y, y_buf_offset, y_sz }; + switch (op) { + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + case GGML_OP_SUM_ROWS: + { + const uint32_t nr = ggml_nrows(src0); + if (nr > 262144) { + elements = { 512, 512, CEIL_DIV(nr, 262144) }; + } else if (nr > 512) { + elements = { 512, CEIL_DIV(nr, 512), 1 }; } else { - subbuf_y = { d_X, 0, x_sz }; + elements = { nr, 1, 1 }; } + } break; + case GGML_OP_GROUP_NORM: + { + const uint32_t num_groups = dst->op_params[0]; + elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ROPE: + elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; + break; + case GGML_OP_GET_ROWS: + elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; + break; + case GGML_OP_ARGSORT: + elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 }; + break; + case GGML_OP_IM2COL: + { + const bool is_2D = dst->op_params[6] == 1; - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); - } else if (op == GGML_OP_ROPE) { - // Empty src2 is possible in rope, but the shader needs a buffer - vk_subbuffer subbuf_z; - if (use_src2) { - subbuf_z = { d_Z, z_buf_offset, z_sz }; + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? 
dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t batch = src1->ne[3]; + + elements = { OW * KW * KH, OH, batch * IC }; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + const uint32_t dim = dst->op_params[0]; + uint32_t half_ceil = (dim + 1) / 2; + elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; + } break; + case GGML_OP_ADD: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_REPEAT: + case GGML_OP_CPY: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_UNARY: + { + const uint32_t ne = ggml_nelements(dst); + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; } else { - subbuf_z = { d_X, 0, x_sz }; + elements = { ne, 1, 1 }; } + } break; + default: + elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; + break; + } - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); - } else if (op == GGML_OP_IM2COL) { - // im2col uses only src1 and dst buffers - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); - } else if (use_src2) { - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); - } else if (use_src1) { - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + if (!op_supports_incontiguous) { + if (x_sz != VK_WHOLE_SIZE) { + x_sz *= ne02 * ne03; + } + if (use_src1 && y_sz != VK_WHOLE_SIZE) { + y_sz *= ne12 * ne13; + } + if (use_src2 && z_sz != VK_WHOLE_SIZE) { + z_sz *= ne22 * ne23; + } + if (d_sz != VK_WHOLE_SIZE) { + d_sz *= ned2 * ned3; + } + } + + if (op == GGML_OP_SOFT_MAX) { + // Empty src1 is possible in soft_max, but the shader needs a buffer + vk_subbuffer subbuf_y; + if (use_src1) { + subbuf_y = { d_Y, y_buf_offset, y_sz }; } else { - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + subbuf_y = { d_X, 0, x_sz }; } + + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (op == GGML_OP_ROPE) { + // Empty src2 is possible in rope, but the shader needs a buffer + vk_subbuffer subbuf_z; + if (use_src2) { + subbuf_z = { d_Z, z_buf_offset, z_sz }; + } else { + subbuf_z = { d_X, 0, x_sz }; + } + + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (op == GGML_OP_IM2COL) { + // im2col uses only src1 and dst buffers + ggml_vk_sync_buffers(subctx); 
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (use_src2) { + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (use_src1) { + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { - GGML_ASSERT(op != GGML_OP_SOFT_MAX); - GGML_ASSERT(op != GGML_OP_ARGSORT); - GGML_ASSERT(!use_src2); - - ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03); - - switch (op) { - case GGML_OP_NORM: - case GGML_OP_GROUP_NORM: - case GGML_OP_RMS_NORM: - elements = { (uint32_t)ne01, 1, 1 }; - break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_ROPE: - elements = { (uint32_t)ne01, (uint32_t)ne00, 1 }; - break; - case GGML_OP_GET_ROWS: - elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; - break; - default: - elements = { (uint32_t)ne0, 1, 1 }; - break; - } - - for (uint64_t i03 = 0; i03 < ne03; i03++) { - for (uint64_t i02 = 0; i02 < ne02; i02++) { - const uint32_t it_idx0 = (i03 * ne02 + i02); - const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0; - const uint32_t x_offset = x_sz * it_idx0; - const uint32_t y_offset = y_sz * it_idx1; - const uint32_t d_offset = d_sz * it_idx0; - - if (use_src1) { - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); - } else { - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); - } - } - } + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } } -static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {}); -} - -static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4471,10 +4460,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }); + }, dryrun); } -static void 
ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4486,10 +4475,10 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }); + }, dryrun); } -static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4501,10 +4490,10 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }); + }, dryrun); } -static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4516,10 +4505,10 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }); + }, dryrun); } -static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { int * op_params = (int *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); @@ -4533,10 +4522,10 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, op_params[0], - }); + }, dryrun); } -static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& 
subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const float sf0 = (float)dst->ne[0] / src0->ne[0]; @@ -4549,10 +4538,10 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], sf0, sf1, sf2, sf3, - }); + }, dryrun); } -static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4563,10 +4552,10 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], 0.0f - }); + }, dryrun); } -static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4576,10 +4565,10 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } -static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4590,10 +4579,10 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], op_params[1], - }); + }, dryrun); } -static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4603,10 
+4592,23 @@ static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } -static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + }, dryrun); +} + +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4618,40 +4620,41 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, d_offset, 0.0f, 0.0f, - }); + }, dryrun); } -static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); } -static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - int * op_params = (int *)dst->op_params; +static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + const int * int_op_params = (const int *)dst->op_params; + const float * float_op_params = (const float *)dst->op_params; - uint32_t num_groups = op_params[0]; - uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - static const float eps = 1e-6f; + const uint32_t num_groups = int_op_params[0]; + const float eps = float_op_params[1]; + const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + 
num_groups - 1) / num_groups); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); } -static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); } -static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun); } -static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; float scale = op_params[0]; @@ -4673,10 +4676,10 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, scale, max_bias, m0, m1, n_head_log2, - }); + }, dryrun); } -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { const int n_dims = ((int32_t *) dst->op_params)[1]; // const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -4697,10 +4700,10 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, src2 != nullptr, - }); + }, dryrun); } -static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const 
ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { int32_t * op_params = (int32_t *)dst->op_params; uint32_t ncols = src0->ne[0]; @@ -4716,14 +4719,14 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c ncols, ncols_pad, op_params[0], - }); + }, dryrun); } -static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }); +static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun); } -static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { const int32_t s0 = dst->op_params[0]; const int32_t s1 = dst->op_params[1]; const int32_t p0 = dst->op_params[2]; @@ -4754,22 +4757,22 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co pelements, IC * KH * KW, s0, s1, p0, p1, d0, d1, - }); + }, dryrun); } -static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t dim = dst->op_params[0]; const uint32_t max_period = dst->op_params[1]; const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { nb1, dim, max_period, - }); + }, dryrun); } -static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const float * op_params = (const float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun); } #ifdef GGML_VULKAN_RUN_TESTS @@ -4915,9 +4918,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } } - ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); if (split_k > 1) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -5164,7 +5167,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_quantize_data(x, qx, ne, quant); 
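// The rename above (allocate -> request) is one half of a two-pass scheme: while
// the graph is built in dry-run mode, each op only *requests* the descriptor sets
// it will need, and ggml_pipeline_allocate_descriptor_sets() later satisfies all
// requests in a single pass before the real recording begins. A minimal sketch of
// the counting idea — descriptor_set_tracker and its members are hypothetical
// names for illustration, not the actual vk_device bookkeeping:

#include <cstdint>
#include <unordered_map>

struct pipeline;  // stand-in for the backend's pipeline struct

struct descriptor_set_tracker {
    // dry-run pass: accumulate how many sets each pipeline will need
    std::unordered_map<pipeline *, uint32_t> requirements;

    void request(pipeline * p, uint32_t n) {
        requirements[p] += n;
    }

    // before recording: satisfy every requirement in one pass over the map
    template <typename AllocFn>
    void allocate_all(AllocFn && alloc) {
        for (auto & [p, n] : requirements) {
            alloc(p, n);  // e.g. grow the pipeline's descriptor pool to n sets
        }
        requirements.clear();
    }
};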
ggml_vk_dequantize_data(qx, x_ref, ne, quant); - ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1); + ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); @@ -5285,9 +5288,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, y[i] = (i % k == i / k) ? 1.0f : 0.0f; } - ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); if (split_k > 1) { - ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -5415,135 +5418,6 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) return extra; } -static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ - VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")"); - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; - - if (extra == nullptr) { - return; - } - - ggml_tensor * src0 = node->src[0]; - ggml_tensor * src1 = node->src[1]; - - const bool use_src0 = src0 != nullptr; - const int64_t ne00 = use_src0 ? src0->ne[0] : 0; - const int64_t ne01 = use_src0 ? src0->ne[1] : 0; - const int64_t ne02 = use_src0 ? src0->ne[2] : 0; - const int64_t ne03 = use_src0 ? src0->ne[3] : 0; - const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP; - const int64_t ne10 = use_src1 ? src1->ne[0] : 0; - const int64_t ne11 = use_src1 ? src1->ne[1] : 0; - const int64_t ne12 = use_src1 ? src1->ne[2] : 0; - const int64_t ne13 = use_src1 ? src1->ne[3] : 0; - const int64_t ne20 = node->ne[0]; - const int64_t ne21 = node->ne[1]; - const int64_t ne22 = node->ne[2]; - const int64_t ne23 = node->ne[3]; - - const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16; - const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16; - - const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1); - - const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig; - - bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; - - const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig); - const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); - - int split_k; - if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { - split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - } else { - split_k = 1; - } - const uint32_t x_ne = ne00 * ne01; - const uint32_t y_ne = ne10 * ne11; - const uint32_t d_ne = ne20 * ne21; - - const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? 
ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; - const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0; - - if (extra->buffer_gpu.expired()) { - // Workaround for CPU backend BLAS matmul calls - extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz); - } - - switch (node->op) { - case GGML_OP_REPEAT: - case GGML_OP_GET_ROWS: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_ADD: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_CLAMP: - case GGML_OP_PAD: - case GGML_OP_CPY: - case GGML_OP_CONT: - case GGML_OP_DUP: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_CONCAT: - case GGML_OP_UPSCALE: - case GGML_OP_NORM: - case GGML_OP_GROUP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - case GGML_OP_ARGSORT: - case GGML_OP_SUM_ROWS: - case GGML_OP_IM2COL: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(node)) { - case GGML_UNARY_OP_SILU: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_TANH: - break; - default: - return; - } - break; - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: - if ( - x_sz > ctx->device->max_memory_allocation_size || - y_sz > ctx->device->max_memory_allocation_size || - d_sz > ctx->device->max_memory_allocation_size || - split_k_size > ctx->device->max_memory_allocation_size) { - GGML_ABORT("Requested preallocation size is too large"); - } - if (ctx->prealloc_size_x < x_sz) { - ctx->prealloc_size_x = x_sz; - } - if (ctx->prealloc_size_y < y_sz) { - ctx->prealloc_size_y = y_sz; - } - if (ctx->prealloc_size_split_k < split_k_size) { - ctx->prealloc_size_split_k = split_k_size; - } - if (ctx->staging_size < x_sz + y_sz) { - ctx->staging_size = x_sz + y_sz; - } - break; - default: - return; - } -} - static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul, @@ -5708,19 +5582,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k); } - if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) { - VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")"); - // Resize buffer - if (ctx->staging != nullptr) { - ggml_vk_destroy_buffer(ctx->staging); - } - ctx->staging = ggml_vk_create_buffer_check(ctx->device, ctx->staging_size, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); - } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){ +static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { @@ -5729,7 +5593,6 @@ static void 
ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); ctx->semaphore_idx = 0; - ctx->staging_offset = 0; const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; @@ -5791,75 +5654,77 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod vk_context compute_ctx; - if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); + if (!dryrun) { + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + } else { + compute_ctx = ctx->compute_ctx.lock(); + } } switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(ctx, compute_ctx, src0, node); + ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); + ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_ADD: - ggml_vk_add(ctx, compute_ctx, src0, src1, node); + ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_MUL: - ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_DIV: - ggml_vk_div(ctx, compute_ctx, src0, src1, node); + ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_CONCAT: - ggml_vk_concat(ctx, compute_ctx, src0, src1, node); + ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_UPSCALE: - ggml_vk_upscale(ctx, compute_ctx, src0, node); + ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_SCALE: - ggml_vk_scale(ctx, compute_ctx, src0, node); + ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_SQR: - ggml_vk_sqr(ctx, compute_ctx, src0, node); + ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_CLAMP: - ggml_vk_clamp(ctx, compute_ctx, src0, node); + ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_PAD: - ggml_vk_pad(ctx, compute_ctx, src0, node); + ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(ctx, compute_ctx, src0, node); + ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_NORM: - ggml_vk_norm(ctx, compute_ctx, src0, node); + ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_GROUP_NORM: - ggml_vk_group_norm(ctx, compute_ctx, src0, node); + ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, compute_ctx, src0, node); + ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_UNARY: @@ -5869,59 +5734,63 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: - ggml_vk_unary(ctx, compute_ctx, src0, node); + ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); break; default: return; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); + ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_SOFT_MAX: - ggml_vk_soft_max(ctx, compute_ctx, src0, src1, 
node); + ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_ROPE: - ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node); + ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, dryrun); break; case GGML_OP_ARGSORT: - ggml_vk_argsort(ctx, compute_ctx, src0, node); + ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_SUM_ROWS: - ggml_vk_sum_rows(ctx, compute_ctx, src0, node); + ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_IM2COL: - ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); + ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_TIMESTEP_EMBEDDING: - ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); + ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_LEAKY_RELU: - ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); + ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node); + ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node, dryrun); break; case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node); + ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); break; default: return; } + if (dryrun) { + return; + } + ctx->tensor_ctxs[node_idx] = compute_ctx; -#ifdef GGML_VULKAN_CHECK_RESULTS +#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF) // Force context reset on each node so that each tensor ends up in its own context // and can be run and compared to its CPU equivalent separately last_node = true; @@ -6005,6 +5874,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); +#ifdef GGML_VULKAN_PERF + std::chrono::steady_clock::time_point start; +#endif // GGML_VULKAN_PERF + // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { // Do staging buffer copies @@ -6012,11 +5885,21 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(cpy.dst, cpy.src, cpy.n); } +#ifdef GGML_VULKAN_PERF + start = std::chrono::steady_clock::now(); +#endif // GGML_VULKAN_PERF + ggml_vk_submit(subctx, ctx->fence); } if (tensor_idx == subctx->exit_tensor_idx) { VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + +#ifdef GGML_VULKAN_PERF + auto duration = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); + ctx->device->perf_logger->log_timing(tensor, duration.count()); +#endif // GGML_VULKAN_PERF + ctx->device->device.resetFences({ ctx->fence }); // Do staging buffer copies @@ -6038,12 +5921,14 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { } ctx->gc.temp_buffers.clear(); - for (auto& pipeline : ctx->device->pipelines) { - if (pipeline.expired()) { + for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) { + vk_pipeline_ref plr = ctx->device->pipelines[dsr.first]; + + if (plr.expired()) { continue; } - vk_pipeline pl = pipeline.lock(); + vk_pipeline pl = plr.lock(); ggml_pipeline_cleanup(pl); } @@ -6067,10 +5952,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->device->device.resetEvent(event); } - ctx->staging_offset = 0; - ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); + ctx->device->pipeline_descriptor_set_requirements.clear(); } // Clean up on backend free @@ -6081,7 +5965,6 @@ static void 
ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ggml_vk_destroy_buffer(ctx->prealloc_x); ggml_vk_destroy_buffer(ctx->prealloc_y); ggml_vk_destroy_buffer(ctx->prealloc_split_k); - ggml_vk_destroy_buffer(ctx->staging); for (auto& buffer : ctx->buffer_pool) { ggml_vk_destroy_buffer(buffer); @@ -6090,7 +5973,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->prealloc_size_x = 0; ctx->prealloc_size_y = 0; ctx->prealloc_size_split_k = 0; - ctx->staging_size = 0; for (auto& event : ctx->gc.events) { ctx->device->device.destroyEvent(event); @@ -6419,7 +6301,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -6442,7 +6324,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { @@ -6508,9 +6390,10 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true); } ggml_vk_preallocate_buffers(ctx); + ggml_pipeline_allocate_descriptor_sets(ctx->device); int last_node = cgraph->n_nodes - 1; @@ -6523,7 +6406,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ctx->tensor_ctxs.resize(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false); } for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6549,6 +6432,10 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen GGML_ASSERT(ok); } +#ifdef GGML_VULKAN_PERF + ctx->device->perf_logger->print_timings(); +#endif + ggml_vk_graph_cleanup(ctx); return GGML_STATUS_SUCCESS; @@ -6640,10 +6527,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const return false; } break; case GGML_OP_REPEAT: - { - ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - } break; + return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float); case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_NONE: @@ -7107,12 +6991,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); } else if (tensor->op == GGML_OP_PAD) { tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - 
src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); + } else if (tensor->op == GGML_OP_REPEAT) { + tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_GROUP_NORM) { - tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params); + tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params, ((float *)tensor->op_params)[1]); } else if (tensor->op == GGML_OP_RMS_NORM) { tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/vulkan-shaders/concat.comp index 08ab5514b..c23b6eb1b 100644 --- a/ggml/src/vulkan-shaders/concat.comp +++ b/ggml/src/vulkan-shaders/concat.comp @@ -30,6 +30,10 @@ void main() { #ifndef OPTIMIZATION_ERROR_WORKAROUND data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); #else - data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx]; + if (is_src0) { + data_d[p.d_offset + dst_idx] = data_a[src0_idx]; + } else { + data_d[p.d_offset + dst_idx] = data_b[src1_idx]; + } #endif } diff --git a/ggml/src/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp index 46a6369bc..d3ccba7fc 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec.comp @@ -39,8 +39,7 @@ void main() { vec2 v = dequantize(ib, iqs, a_offset / QUANT_K); // matrix multiplication - tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) + - FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); + tmp[tid] = fma(FLOAT_TYPE(v.x), FLOAT_TYPE(data_b[b_offset + iybs + iqs]), fma(FLOAT_TYPE(v.y), FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]), tmp[tid])); } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp index cb3f3c0df..1cc4996d3 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp @@ -53,7 +53,7 @@ void main() { const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]); - tmp[tid] += xi * FLOAT_TYPE(data_b[iy]); + tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]); } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp index 4b1871caa..9b443807d 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp @@ -52,7 +52,7 @@ void main() { // y is not transposed but permuted const uint iy = channel*nrows_y + row_y; - tmp[tid] += xi * FLOAT_TYPE(data_b[iy]); + tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]); } // dst is not transposed and not permuted diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp index 4cd97799d..ec8eadcd5 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -39,24 +39,25 @@ void main() { FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * 
FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3) - + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3); - sum2 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF) - + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF); + sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1)))))))); + 
sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2)))))))); } - tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx])); } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp index a6e430ea0..3ca4ad85a 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -40,16 +40,17 @@ void main() { FLOAT_TYPE sum = FLOAT_TYPE(0.0); for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 
0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)); + sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 
0 : 4)), sum)))))))); } - tmp[16 * ix + tid] += d * sum; + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]); } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp index 75569363c..d91e00e10 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -67,17 +67,17 @@ void main() { const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4); const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4); - const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1 + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3); - const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_5 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7); - const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx]) * q4_8 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_9 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * q4_10 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11); - const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_12 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_13 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * q4_14 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15); - const FLOAT_TYPE smin = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y1_idx ]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7 - + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7 - + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * sc7 - + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7 - ); - tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin); + const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx]), q4_0, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), q4_1, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3))); + const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_4, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), q4_5, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), q4_6, FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7))); + const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx]), q4_8, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), q4_9, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), q4_10, FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11))); + const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_12, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), q4_13, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), q4_14, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, 
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7, + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), sc7, + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), sc7, + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 3]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 35]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 3]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7))))))))))))))); + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx])); #else const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); @@ -88,16 +88,19 @@ void main() { const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1); - const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3); - const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5); - const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7); - const FLOAT_TYPE smin = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7 - + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7 - ); + const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), q4_0, FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1); + const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3); + const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), q4_4, FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5); + const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7, + + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7))))))); - tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 
+ i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin); + tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin); + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(dall, (fma(sx, FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f), fma(sy, FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f), + fma(sz, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)), fma(sw, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))))))), fma(-dmin, smin, tmp[tmp_idx])); #endif } diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp index 9be3645bd..2306785af 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -66,35 +66,33 @@ void main() { const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4); const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4); - const FLOAT_TYPE sx = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y1_idx ]) * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)) - ); - const FLOAT_TYPE sy = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)) - ); - const FLOAT_TYPE sz = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y2_idx ]) * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)) - ); - const FLOAT_TYPE sw = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)) - + FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 
16 : 0)) - ); - const FLOAT_TYPE smin = FLOAT_TYPE( - (FLOAT_TYPE(data_b[b_offset + y1_idx]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17])) * sc2 + (FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49])) * sc3 - + (FLOAT_TYPE(data_b[b_offset + y2_idx]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17])) * sc6 + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7 - ); - tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin); + const FLOAT_TYPE sx = + fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)), + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))))); + const FLOAT_TYPE sy = + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))))); + const FLOAT_TYPE sz = + fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)), + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))))); + const FLOAT_TYPE sw = + fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 
16 : 0))))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2, + fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3, + fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6, + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7))); + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx])); } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp index d610cf030..95c286eeb 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -44,22 +44,22 @@ void main() { const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); #if K_QUANTS_PER_ITERATION == 1 - FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32); - tmp[16 * ix + tid] += sum; + const uint tmp_idx = 16 * ix + tid; + tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) 
* d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx])))))))); #else FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { - sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32); + sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32), + fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum)))); } tmp[16 * ix + tid] += sum; #endif diff --git a/ggml/src/vulkan-shaders/mul_mm.comp 
b/ggml/src/vulkan-shaders/mul_mm.comp index 5fe9d5241..fffdd1818 100644 --- a/ggml/src/vulkan-shaders/mul_mm.comp +++ b/ggml/src/vulkan-shaders/mul_mm.comp @@ -326,10 +326,10 @@ void main() { mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); } const float d = loadd.x * sc; - const float m = loadd.y * mbyte; + const float m = -loadd.y * mbyte; - buf_a[buf_idx ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) - m); - buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m); + buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m)); + buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); #elif defined(DATA_A_Q5_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; @@ -357,10 +357,10 @@ void main() { mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); } const float d = loadd.x * sc; - const float m = loadd.y * mbyte; + const float m = -loadd.y * mbyte; - buf_a[buf_idx ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0)) - m); - buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m); + buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m)); + buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m)); #elif defined(DATA_A_Q6_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; @@ -463,7 +463,8 @@ void main() { [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint cc = 0; cc < TN; cc++) { [[unroll]] for (uint cr = 0; cr < TM; cr++) { - sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr] += float(cache_a[wsir * TM + cr]) * float(cache_b[wsic * TN + cc]); + const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr; + sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]); } } } diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/vulkan-shaders/repeat.comp new file mode 100644 index 000000000..a86af87e7 --- /dev/null +++ b/ggml/src/vulkan-shaders/repeat.comp @@ -0,0 +1,24 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +uint src0_idx_mod(uint idx) { + const uint i13 = idx / (p.ne12*p.ne11*p.ne10); + const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; + const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); + const uint i12_offset = i12*p.ne11*p.ne10; + const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; + const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; + return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; +} + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); +} diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index a792e203b..53ceb13d3 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp 
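The new repeat.comp shader above broadcasts src0 into dst by unravelling the flat destination index into four coordinates and wrapping each one by the corresponding source dimension. The same arithmetic rendered as host-side C++, as a rough reference — the repeat_params fields are assumptions mirroring the shader's push constants, not an actual API:

#include <cstdint>

struct repeat_params {
    uint32_t ne10, ne11, ne12;          // dst dims 0..2 (dim 3 is implied by idx)
    uint32_t ne00, ne01, ne02, ne03;    // src dims
    uint32_t nb00, nb01, nb02, nb03;    // src strides, in elements
};

uint32_t src0_idx_mod(const repeat_params & p, uint32_t idx) {
    // unravel the flat dst index into coordinates (i10, i11, i12, i13)
    const uint32_t i13     = idx / (p.ne12 * p.ne11 * p.ne10);
    const uint32_t i13_off = i13 * p.ne12 * p.ne11 * p.ne10;
    const uint32_t i12     = (idx - i13_off) / (p.ne11 * p.ne10);
    const uint32_t i12_off = i12 * p.ne11 * p.ne10;
    const uint32_t i11     = (idx - i13_off - i12_off) / p.ne10;
    const uint32_t i10     = idx - i13_off - i12_off - i11 * p.ne10;

    // wrap each coordinate by the source dimension, so reads repeat src0
    return (i13 % p.ne03) * p.nb03 + (i12 % p.ne02) * p.nb02 +
           (i11 % p.ne01) * p.nb01 + (i10 % p.ne00) * p.nb00;
}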
@@ -380,6 +380,10 @@ void process_shaders(std::vector>& tasks) { string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); })); From 234b30676a97ce227b604c38beb9dcaca406dea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Podiv=C3=ADn?= <66251151+jpodivin@users.noreply.github.com> Date: Thu, 15 Aug 2024 08:21:57 +0200 Subject: [PATCH 137/154] server : init stop and error fields of the result struct (#9026) Signed-off-by: Jiri Podivin --- examples/server/server.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c25338f57..ace17a12f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1852,6 +1852,8 @@ struct server_context { llama_lora_adapters_apply(ctx, lora_adapters); server_task_result result; result.id = task.id; + result.stop = true; + result.error = false; result.data = json{{ "success", true }}; queue_results.send(result); } break; From d5492f0525fa533817a67e93a4bde9d71d81cf58 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 15 Aug 2024 10:11:11 +0300 Subject: [PATCH 138/154] ci : disable bench workflow (#9010) --- .github/workflows/{bench.yml => bench.yml.disabled} | 3 +++ 1 file changed, 3 insertions(+) rename .github/workflows/{bench.yml => bench.yml.disabled} (98%) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml.disabled similarity index 98% rename from .github/workflows/bench.yml rename to .github/workflows/bench.yml.disabled index 56d22bc0c..bfdbb4ef5 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml.disabled @@ -1,3 +1,6 @@ +# TODO: there have been some issues with the workflow, so disabling for now +# https://github.com/ggerganov/llama.cpp/issues/7893 +# # Benchmark name: Benchmark From 6bda7ce6c3a9284fcbb70c1ace4107db8eb63e5c Mon Sep 17 00:00:00 2001 From: Esko Toivonen Date: Thu, 15 Aug 2024 10:17:12 +0300 Subject: [PATCH 139/154] llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850) --- convert_hf_to_gguf.py | 8 +++++++- convert_hf_to_gguf_update.py | 2 ++ include/llama.h | 2 ++ src/llama-vocab.cpp | 2 ++ src/llama.cpp | 6 ++++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 550dd5cfd..41063d94b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -590,6 +590,12 @@ class Model: if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M res = "smollm" + if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": + # ref: https://huggingface.co/bigscience/bloom + res = "bloom" + if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small + res = "gpt3-finnish" if res is None: logger.warning("\n") @@ -893,7 +899,7 @@ class GPTNeoXModel(Model): return tensors -@Model.register("BloomForCausalLM") +@Model.register("BloomForCausalLM", "BloomModel") class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM diff --git a/convert_hf_to_gguf_update.py 
b/convert_hf_to_gguf_update.py index d5a2d925e..ba98f5c88 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -94,6 +94,8 @@ models = [ {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, + {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, + {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, ] diff --git a/include/llama.h b/include/llama.h index 3c28cf0b5..fda68da85 100644 --- a/include/llama.h +++ b/include/llama.h @@ -93,6 +93,8 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 749f85718..063af648e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -410,6 +410,8 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_PORO: + case LLAMA_VOCAB_PRE_TYPE_BLOOM: + case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: regex_exprs = { " ?[^(\\s|.,!?…。,、।۔،)]+", }; diff --git a/src/llama.cpp b/src/llama.cpp index 7f2f00031..bf7a57c79 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5467,6 +5467,12 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "codeshell") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; + } else if ( + tokenizer_pre == "bloom") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM; + } else if ( + tokenizer_pre == "gpt3-finnish") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From 4af8420afba39de4968adf2695ce12fd42422a13 Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:23:23 +0800 Subject: [PATCH 140/154] common : remove duplicate function llama_should_add_bos_token (#8778) --- common/common.cpp | 6 ------ common/common.h | 4 ---- examples/cvector-generator/cvector-generator.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 4 ++-- examples/infill/infill.cpp | 4 ++-- examples/main/main.cpp | 4 ++-- examples/perplexity/perplexity.cpp | 12 ++++++------ examples/server/server.cpp | 7 +++---- examples/tokenize/tokenize.cpp | 2 +- include/llama.h | 7 ++----- src/llama-vocab.cpp | 4 ++-- src/llama-vocab.h | 4 ++-- src/llama.cpp | 4 ++-- 14 files changed, 26 insertions(+), 40 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d3d896115..ce46e65ae 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2702,12 +2702,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector return text; } -bool llama_should_add_bos_token(const llama_model * model) { - const int add_bos = llama_add_bos_token(model); - - return add_bos != -1 ? 
bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); -} - // // Chat template utils // diff --git a/common/common.h b/common/common.h index bbc33a499..df23460a5 100644 --- a/common/common.h +++ b/common/common.h @@ -380,10 +380,6 @@ std::string llama_detokenize( const std::vector & tokens, bool special = true); -// Uses the value from the model metadata if possible, otherwise -// defaults to true when model type is SPM, otherwise false. -bool llama_should_add_bos_token(const llama_model * model); - // // Chat template utils // diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index a12e90d82..8fa492571 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -271,7 +271,7 @@ struct tokenized_prompt { size_t max_seq_len; tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index ef35ba2c0..5e89988e2 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } static bool run(llama_context * ctx, const gpt_params & params) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 58814b96e..83b85d72b 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -433,8 +433,8 @@ static void process_logits( } static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92d630b15..05700c1d5 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -203,8 +203,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); } - const bool add_bos = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); + const bool add_bos = llama_add_bos_token(model); + GGML_ASSERT(!llama_add_eos_token(model)); LOG("add_bos: %d\n", add_bos); std::vector embd_inp; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6e0635a66..4a342ad03 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -267,9 +267,9 @@ int main(int argc, char ** argv) { } } - const bool add_bos = llama_should_add_bos_token(model); + const bool add_bos = llama_add_bos_token(model); if (!llama_model_has_encoder(model)) { - GGML_ASSERT(llama_add_eos_token(model) != 1); + GGML_ASSERT(!llama_add_eos_token(model)); } LOG("add_bos: %d\n", add_bos); 
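Note on migrating downstream code: llama_should_add_bos_token() is removed, and llama_add_bos_token()/llama_add_eos_token() now return bool directly, so the old tri-state handling (-1 unknown / 0 false / 1 true) disappears from call sites; the SPM-based fallback the wrapper used to apply is handled once at vocab load time instead. A minimal sketch of the updated usage (standalone, with a placeholder model path; not part of the patch itself):

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // before: const bool add_bos = llama_should_add_bos_token(model);
        const bool add_bos = llama_add_bos_token(model);

        // before: GGML_ASSERT(llama_add_eos_token(model) != 1);
        const bool add_eos = llama_add_eos_token(model);

        printf("add_bos: %d add_eos: %d\n", add_bos, add_eos);

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }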
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 372684f09..484dd5891 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -340,8 +340,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); @@ -480,8 +480,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); std::ofstream logits_stream; if (!params.logits_file.empty()) { @@ -1733,8 +1733,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int nv = 2*((n_vocab + 1)/2) + 4; - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); + const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ace17a12f..3fe0e6558 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -693,9 +693,8 @@ struct server_context { n_ctx = llama_n_ctx(ctx); - add_bos_token = llama_should_add_bos_token(model); - has_eos_token = llama_add_eos_token(model) != 1; - + add_bos_token = llama_add_bos_token(model); + has_eos_token = !llama_add_eos_token(model); return true; } @@ -2038,7 +2037,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { - const bool add_bos = llama_should_add_bos_token(model); + const bool add_bos = llama_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 17f5e4961..c817be566 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -362,7 +362,7 @@ int main(int raw_argc, char ** raw_argv) { prompt = stdin_buffer.str(); } - const bool model_wants_add_bos = llama_should_add_bos_token(model); + const bool model_wants_add_bos = llama_add_bos_token(model); const bool add_bos = model_wants_add_bos && !no_bos; const bool parse_special = !no_parse_special; diff --git a/include/llama.h b/include/llama.h index fda68da85..ed81aa469 100644 --- a/include/llama.h +++ b/include/llama.h @@ -914,11 +914,8 @@ extern "C" { LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding - // Returns -1 if unknown, 1 for true or 0 for 
false. - LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); - - // Returns -1 if unknown, 1 for true or 0 for false. - LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); + LLAMA_API bool llama_add_bos_token(const struct llama_model * model); + LLAMA_API bool llama_add_eos_token(const struct llama_model * model); // Codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 063af648e..11fffce93 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1468,11 +1468,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) { return vocab.special_pad_id; } -int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) { +bool llama_add_bos_token_impl(const struct llama_vocab & vocab) { return vocab.tokenizer_add_bos; } -int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) { +bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { return vocab.tokenizer_add_eos; } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 7adfc16da..6e8f30be4 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab); -int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab); +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); diff --git a/src/llama.cpp b/src/llama.cpp index bf7a57c79..ee36de977 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18705,11 +18705,11 @@ llama_token llama_token_pad(const struct llama_model * model) { return llama_token_pad_impl(model->vocab); } -int32_t llama_add_bos_token(const struct llama_model * model) { +bool llama_add_bos_token(const struct llama_model * model) { return llama_add_bos_token_impl(model->vocab); } -int32_t llama_add_eos_token(const struct llama_model * model) { +bool llama_add_eos_token(const struct llama_model * model) { return llama_add_eos_token_impl(model->vocab); } From 37501d9c79ed5897db4b73ea7502211d25b3f763 Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Thu, 15 Aug 2024 15:28:05 +0800 Subject: [PATCH 141/154] server : fix duplicated n_predict key in the generation_settings (#8994) --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3fe0e6558..e073f5813 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1321,7 +1321,7 @@ struct server_context { return json { {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, + {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, {"temperature", slot.sparams.temp}, @@ -1343,7 +1343,7 @@ struct server_context { {"mirostat_eta", slot.sparams.mirostat_eta}, {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict + {"max_tokens", slot.params.n_predict}, // User configured n_predict {"n_keep", 
slot.params.n_keep}, {"n_discard", slot.params.n_discard}, {"ignore_eos", ignore_eos}, From 4b9afbbe9037f8a2d659097c0c7d9fce32c6494c Mon Sep 17 00:00:00 2001 From: gtygo Date: Thu, 15 Aug 2024 15:40:12 +0800 Subject: [PATCH 142/154] retrieval : fix memory leak in retrieval query handling (#8955) * retrieval * Reuse querybatch to reduce frequent memory allocation * delete unused white space --- examples/retrieval/retrieval.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 65b19ce71..aab9d8105 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -253,6 +253,8 @@ int main(int argc, char ** argv) { chunks[i].tokens.clear(); } + struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); + // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { @@ -260,7 +262,6 @@ int main(int argc, char ** argv) { std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); - struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); @@ -293,6 +294,7 @@ int main(int argc, char ** argv) { } // clean up + llama_batch_free(query_batch); llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); From e3f6fd56b1ab3c426596217d786910d641ae6ce0 Mon Sep 17 00:00:00 2001 From: Nico Bosshard Date: Fri, 16 Aug 2024 04:22:55 +0200 Subject: [PATCH 143/154] ggml : dynamic ggml_sched_max_splits based on graph_size (#9047) * ggml : Dynamic ggml_sched_max_splits based on graph_size * Fixed and readded debug code for causes --- ggml/src/ggml-backend.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index e1651cc64..8856967c9 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -1018,10 +1018,6 @@ static bool ggml_is_view_op(enum ggml_op op) { #define GGML_SCHED_MAX_BACKENDS 16 #endif -#ifndef GGML_SCHED_MAX_SPLITS -#define GGML_SCHED_MAX_SPLITS 2048 -#endif - #ifndef GGML_SCHED_MAX_SPLIT_INPUTS #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #endif @@ -1125,7 +1121,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only +#define GGML_SCHED_MAX_SPLITS_DEBUG 4096 +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) 
sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -1549,7 +1546,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } - GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS); split = &sched->splits[i_split]; split->backend_id = node_backend_id; split->i_start = i; @@ -1865,13 +1861,14 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); - const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; + const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph + const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); + sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); sched->context_buffer = malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; From 2a24c8caa6d10a7263ca317fa7cb64f0edc72aae Mon Sep 17 00:00:00 2001 From: Yoshi Suhara Date: Thu, 15 Aug 2024 19:23:33 -0700 Subject: [PATCH 144/154] Add Nemotron/Minitron GGUF Conversion & Inference Support (#8922) * Add nemotron GGUF conversion & inference support * Fix formatting issues * Remove unnecessary write_tensors() * Update convert_hf_to_gguf.py Co-authored-by: compilade * Update src/llama.cpp Co-authored-by: compilade * Address comments by @compilade * Replace ggml_mul_mat()->llm_build_lora_mm() * Remove mutable variable * Use for bias tensors * Cover corner case for role_scaling not in config.json --------- Co-authored-by: compilade --- convert_hf_to_gguf.py | 41 +++++++ gguf-py/gguf/constants.py | 21 ++++ gguf-py/gguf/tensor_mapping.py | 21 ++-- src/llama.cpp | 199 ++++++++++++++++++++++++++++++++- 4 files changed, 271 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 41063d94b..8f0e8680f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3740,6 +3740,47 @@ class ChatGLMModel(Model): name = name.removeprefix("transformer.") return [(self.map_tensor_name(name), data_torch)] + +@Model.register("NemotronForCausalLM") +class NemotronModel(Model): + model_arch = gguf.MODEL_ARCH.NEMOTRON + + def set_vocab(self): + self._set_vocab_sentencepiece() + self.gguf_writer.add_pad_token_id(0) + self.gguf_writer.add_unk_token_id(1) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) + 
self.gguf_writer.add_layer_norm_eps(f_norm_eps) + + # * Partial RoPE + rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + + # * RopeScaling for Nemotron + if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f63ec450a..63d51d872 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -219,6 +219,7 @@ class MODEL_ARCH(IntEnum): T5 = auto() T5ENCODER = auto() JAIS = auto() + NEMOTRON = auto() class MODEL_TENSOR(IntEnum): @@ -347,6 +348,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1065,6 +1067,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.NEMOTRON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -1105,6 +1122,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.ROPE_FREQS, ], + MODEL_ARCH.NEMOTRON: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } # diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 9aa2209e2..a07f7f582 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf + "model.embed_tokens", # llama-hf nemotron "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -52,7 +52,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -75,6 +75,7 @@ class TensorNameMap: "transformer.rms_norm", # Grok "encoder.final_layernorm", # chatglm "transformer.norm", # openelm + "model.norm", # nemotron ), # Rope frequencies @@ -93,7 +94,7 @@ class TensorNameMap: 
"transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf + "model.layers.{bid}.input_layernorm", # llama-hf nemotron "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -135,7 +136,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -146,7 +147,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -158,7 +159,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -175,7 +176,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -218,7 +219,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi @@ -258,7 +259,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert "transformer.h.{bid}.mlp.fc_in", # gpt-j @@ -329,7 +330,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j diff --git a/src/llama.cpp b/src/llama.cpp index ee36de977..1128bf53b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -210,6 +210,7 @@ enum llm_arch { LLM_ARCH_T5, LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, + LLM_ARCH_NEMOTRON, LLM_ARCH_UNKNOWN, }; @@ -255,6 +256,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, + { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; 
@@ -1296,6 +1298,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, + { + LLM_ARCH_NEMOTRON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -5235,6 +5255,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_NEMOTRON: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -7568,6 +7596,48 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); } } break; + case LLM_ARCH_NEMOTRON: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional MLP bias + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, 
"bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -8254,7 +8324,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 ggml_mul_mat_set_prec(kq, GGML_PREC_F32); @@ -13755,6 +13825,128 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_nemotron() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + //GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct 
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -14010,6 +14202,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_jais(); } break; + case LLM_ARCH_NEMOTRON: + { + result = llm.build_nemotron(); + } break; default: GGML_ABORT("fatal error"); } @@ -17080,6 +17276,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: case LLM_ARCH_CODESHELL: + case LLM_ARCH_NEMOTRON: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here From fb487bb5671b37267f35ff43fe8849ddccd925cd Mon Sep 17 00:00:00 2001 From: Liu Jia Date: Fri, 16 Aug 2024 14:23:12 +0800 Subject: [PATCH 145/154] common : add support for cpu_get_num_physical_cores() on Windows (#8771) * Add support for cpu_get_num_phsical_cores() on Windows * fix build bug on msys2-clang64 and ucrt64 * avoid adding new function * add new macros to avoid windows+mingw64 * Add error checking to return default value --- common/common.cpp | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ce46e65ae..382d585a5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -110,8 +110,34 @@ int32_t cpu_get_num_physical_cores() { if (result == 0) { return num_physical_cores; } -#elif defined(_WIN32) - //TODO: Implement +#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later + // TODO: windows + arm64 + mingw64 + unsigned int n_threads_win = std::thread::hardware_concurrency(); + unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4; + + DWORD buffer_size = 0; + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) { + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return default_threads; + } + } + + std::vector buffer(buffer_size); + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &buffer_size)) { + return default_threads; + } + + int32_t num_physical_cores = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast(buffer.data()); + while (buffer_size > 0) { + if (info->Relationship == RelationProcessorCore) { + num_physical_cores += info->Processor.GroupCount; + } + buffer_size -= info->Size; + info = reinterpret_cast(reinterpret_cast(info) + info->Size); + } + + return num_physical_cores > 0 ? 
num_physical_cores : default_threads; #endif unsigned int n_threads = std::thread::hardware_concurrency(); return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; @@ -1727,7 +1753,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) { if (params.n_threads_batch != -1) { os << " (n_threads_batch = " << params.n_threads_batch << ")"; } +#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later + // TODO: windows + arm64 + mingw64 + DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); +#else os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); +#endif return os.str(); } From c679e0cb5c378bfb63643c44a453345ba8b90126 Mon Sep 17 00:00:00 2001 From: Minsoo Cheong Date: Fri, 16 Aug 2024 15:35:18 +0900 Subject: [PATCH 146/154] llama : add EXAONE model support (#9025) * add exaone model support * add chat template * fix whitespace Co-authored-by: Georgi Gerganov * add ftype * add exaone pre-tokenizer in `llama-vocab.cpp` Co-Authored-By: compilade <113953597+compilade@users.noreply.github.com> * fix lint Co-Authored-By: compilade <113953597+compilade@users.noreply.github.com> * add `EXAONE` to supported models in `README.md` * fix space Co-authored-by: compilade --------- Co-authored-by: Georgi Gerganov Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> Co-authored-by: compilade --- README.md | 1 + convert_hf_to_gguf.py | 74 ++++++++++++ convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 18 +++ gguf-py/gguf/tensor_mapping.py | 19 ++- include/llama.h | 1 + src/llama-vocab.cpp | 1 + src/llama.cpp | 212 ++++++++++++++++++++++++++++++++- 8 files changed, 320 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7f48fde6e..e76ba9218 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) +- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8f0e8680f..6a1a3a937 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -596,6 +596,9 @@ class Model: if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small res = "gpt3-finnish" + if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct + res = "exaone" if res is None: logger.warning("\n") @@ -3781,6 +3784,77 @@ class NemotronModel(Model): return [(self.map_tensor_name(name), data_torch)] + +@Model.register("ExaoneForCausalLM") +class ExaoneModel(Model): + model_arch = gguf.MODEL_ARCH.EXAONE + + def set_gguf_parameters(self): + hparams = self.hparams + + assert(hparams["activation_function"] == "silu") + + max_position_embeddings = hparams["max_position_embeddings"] + embed_dim = hparams["hidden_size"] + num_heads = hparams["num_attention_heads"] + num_kv_heads = hparams.get("num_key_value_heads", num_heads) + layer_norm_eps = hparams["layer_norm_epsilon"] + intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim + num_layers = hparams["num_layers"] + # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 + # attention_dropout_rate = hparams["attention_dropout"] + # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 + # embed_dropout_rate = hparams["embed_dropout"] + self.gguf_writer.add_embedding_length(embed_dim) + self.gguf_writer.add_head_count(num_heads) + self.gguf_writer.add_head_count_kv(num_kv_heads) + self.gguf_writer.add_context_length(max_position_embeddings) + self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_block_count(num_layers) + self.gguf_writer.add_file_type(self.ftype) + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = rotary_factor if rotary_factor is not None else 1.0 + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: + if hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + + def prepare_tensors(self): + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = 
rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + + super().prepare_tensors() + ###### CONVERSION LOGIC ###### diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ba98f5c88..ff4955f9c 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -96,6 +96,7 @@ models = [ {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, + {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, ] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 63d51d872..5541972ce 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -220,6 +220,7 @@ class MODEL_ARCH(IntEnum): T5ENCODER = auto() JAIS = auto() NEMOTRON = auto() + EXAONE = auto() class MODEL_TENSOR(IntEnum): @@ -349,6 +350,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.T5ENCODER: "t5encoder", MODEL_ARCH.JAIS: "jais", MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1082,6 +1084,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.EXAONE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a07f7f582..a4f185c06 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf nemotron @@ -52,7 +52,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon 
"lm_head.linear", # phi2 @@ -62,7 +62,7 @@ class TensorNameMap: # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon jais + "transformer.ln_f", # gpt2 gpt-j falcon jais exaone "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "transformer.norm_f", # mpt dbrx @@ -89,7 +89,7 @@ class TensorNameMap: # Attention norm MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -143,6 +143,7 @@ class TensorNameMap: "model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.{bid}.attention.wq", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok + "transformer.h.{bid}.attn.attention.q_proj", # exaone ), # Attention key @@ -155,6 +156,7 @@ class TensorNameMap: "model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.{bid}.attention.wk", # internlm2 "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok + "transformer.h.{bid}.attn.attention.k_proj", # exaone ), # Attention value @@ -166,7 +168,8 @@ class TensorNameMap: "transformer.h.{bid}.attn.v", # refact "model.layers.layers.{bid}.self_attn.v_proj", # plamo "model.layers.{bid}.attention.wv", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok + "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok + "transformer.h.{bid}.attn.attention.v_proj", # exaone ), # Attention output @@ -191,6 +194,7 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "encoder.layers.{bid}.self_attention.dense", # chatglm "transformer.layers.{bid}.attn.out_proj", # openelm + "transformer.h.{bid}.attn.attention.out_proj", # exaone ), # Attention output norm @@ -216,7 +220,7 @@ class TensorNameMap: # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron @@ -278,6 +282,7 @@ class TensorNameMap: "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm + "transformer.h.{bid}.mlp.c_fc_1", # exaone ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -309,6 +314,7 @@ class TensorNameMap: "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -348,6 +354,7 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w2", # arctic "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm + "model.layers.h.{bid}.mlp.c_proj", # exaone ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/include/llama.h b/include/llama.h index ed81aa469..188ae76f8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -95,6 +95,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, }; enum llama_rope_type { diff 
--git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 11fffce93..17deefaa8 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -388,6 +388,7 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: + case LLAMA_VOCAB_PRE_TYPE_EXAONE: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/src/llama.cpp b/src/llama.cpp index 1128bf53b..7e9149eb9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -211,6 +211,7 @@ enum llm_arch { LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, + LLM_ARCH_EXAONE, LLM_ARCH_UNKNOWN, }; @@ -257,6 +258,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, + { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1316,6 +1318,25 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_EXAONE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -5263,6 +5284,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_EXAONE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -5501,6 +5531,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "gpt3-finnish") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; + } else if ( + tokenizer_pre == "exaone") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -7638,6 +7671,36 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } } break; + case LLM_ARCH_EXAONE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, 
n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -13895,7 +13958,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } if (il == n_layer - 1) { @@ -13947,6 +14009,133 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_exaone() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + 
model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -14206,6 +14395,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_nemotron(); } break; + case LLM_ARCH_EXAONE: + { + result = llm.build_exaone(); + } break; default: GGML_ABORT("fatal error"); } @@ -17277,6 +17470,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_GPTNEOX: case LLM_ARCH_CODESHELL: case LLM_ARCH_NEMOTRON: + case LLM_ARCH_EXAONE: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here @@ -19207,6 +19401,22 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "Assistant:"; } + } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) { + // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb + // EXAONE-3.0-7.8B-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n"; + } else if (role == "user") { + ss << "[|user|]" << trim(message->content) << "\n"; + } else if (role == "assistant") { + ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n"; + } + } + if (add_ass) { + ss << "[|assistant|]"; + } } else { // template not supported return -1; From 23fd4535445e3e859ded2f02d6142b6e93b43890 Mon Sep 17 00:00:00 2001 From: compilade Date: Fri, 16 Aug 2024 02:36:11 -0400 Subject: [PATCH 147/154] gguf-py : bump version from 0.9.1 to 0.10.0 (#9051) --- gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 19f6761e2..eea381e5a 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.9.1" +version = "0.10.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From c8ddce85606d9fb6e30745b6e4fe103eecadc73f Mon Sep 17 00:00:00 2001 From: Aisuko Date: Fri, 16 Aug 2024 19:08:59 +1000 
Subject: [PATCH 148/154] Fix inference example lacking required parameters (#9035) Signed-off-by: Aisuko --- examples/quantize/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 553c2701b..5d1e11c67 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -34,7 +34,7 @@ Run the quantized model: ```bash # start inference on a gguf model -./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 +./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant" ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. From ee2984bdaf10c14d440ad873a049bcc09b786d9b Mon Sep 17 00:00:00 2001 From: Farbod Bijary <110523279+farbodbj@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:06:30 +0330 Subject: [PATCH 149/154] py : fix wrong input type for raw_dtype in ggml to gguf scripts (#8928) Co-authored-by: farbod --- convert_llama_ggml_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 7b00b4398..29b14e98d 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -116,7 +116,7 @@ class Tensor: assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 - self.dtype= dtype + self.dtype= gguf.GGMLQuantizationType(dtype) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) offset += 4 * n_dims self.name = bytes(data[offset:offset + name_len]) From d565bb2fd5a2a58b9924a7a34e77a87c78c52137 Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Fri, 16 Aug 2024 21:34:41 +0800 Subject: [PATCH 150/154] llava : support MiniCPM-V-2.6 (#8967) * init * rename * add run android for termux in readme * add android readme * add instructions in readme * change name in readme * Update README.md * fixed line * add result in readme * random pos_embed * add positions index * change for ollama * change for ollama * better pos_embed in clip * support ollama * update cmakelist * update cmakelist * rename wrapper * clear code * replace and organize code * add link * sync master * fix warnings * fix warnings * fix bug in bicubic resize when the image needs to be resized smaller * address review comments and modify * address review comments and modify * put all code into llava dir * fix quality problem in pr code * change n_layer * add space in "-1" * imitate reshape bug of python code * fix bug in clip * fix issues for merging * fix llama-minicpmv-cli in cmake file * change pr readme * fix code review * remove line 33 directory in the /cmakelists.txt (not in example, in the main dir) * fix cmakefile * add warn * fix KEY_HAS_MINICPMV_PROJ * remove load_image_size into clip_ctx * remove the extern "C", MINICPMV_API * fix uhd code for review comment * delete minicpmv-wrapper in pr * remove uhd_image_embed * Modify 2 notes * support minicpmv2.6 * modify convert script of minicpmv * modify convert * modify convert * add readme * add resampler of v2.6 * modify clip * modify readme * fix type-check * fix type-check * fix type-check * fix type-check * modify convert script and readme * fix convert script and readme * fix convert * fix num in convert * fix type-check --------- Co-authored-by: Hongji Zhu Co-authored-by: harvestingmoon --- examples/llava/README-minicpmv2.5.md | 4 +- examples/llava/README-minicpmv2.6.md | 107 +++++ examples/llava/clip.cpp | 96
+++- examples/llava/clip.h | 2 +- examples/llava/llava.cpp | 9 +- examples/llava/minicpmv-cli.cpp | 26 +- .../minicpmv-convert-image-encoder-to-gguf.py | 432 +++++++++++++++++- examples/llava/minicpmv-surgery.py | 4 +- 8 files changed, 645 insertions(+), 35 deletions(-) create mode 100644 examples/llava/README-minicpmv2.6.md diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md index 4affc1d0f..62009b0af 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/examples/llava/README-minicpmv2.5.md @@ -16,8 +16,8 @@ Convert PyTorch model to gguf files (You can also download the converted [gguf]( ```bash python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 -python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 -python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model +python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2 +python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model # quantize int4 version ./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M diff --git a/examples/llava/README-minicpmv2.6.md b/examples/llava/README-minicpmv2.6.md new file mode 100644 index 000000000..c4be5e5dd --- /dev/null +++ b/examples/llava/README-minicpmv2.6.md @@ -0,0 +1,107 @@ +## MiniCPM-V 2.6 + +### Prepare models and code + +Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder. + +Clone llama.cpp: +```bash +git clone git@github.com:OpenBMB/llama.cpp.git +cd llama.cpp +git checkout minicpmv-main +``` + +### Usage of MiniCPM-V 2.6 + +Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us) + +```bash +python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6 +python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3 +python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model + +# quantize int4 version +./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M +``` + +Build for Linux or Mac + +```bash +make +make llama-minicpmv-cli +``` + +Inference on Linux or Mac +``` +# run f16 version +./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" + +# run quantized int4 version +./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
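+# note: -c sets the context size; --temp, --top-p, --top-k and --repeat-penalty are the sampling parameters used throughout the MiniCPM-V examples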
+ +# or run in interactive mode +./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i +``` + +### Video +Install FFmpeg +``` +brew install ffmpeg +brew install pkg-config +``` + +### Android + +#### Build on Android device using Termux +We found that building on the Android device brings better runtime performance, so we recommend building on-device. + +[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required). + +Install tools in Termux: +``` +apt update && apt upgrade -y +apt install git make cmake +``` + +It's recommended to move your model inside the `~/` directory for best performance: +``` +cd storage/downloads +mv model.gguf ~/ +``` + +#### Building the Project using Android NDK +Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. + +Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: + +```bash +mkdir build-android +cd build-android +export NDK=/your_ndk_path +cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. +make +``` + +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). + +Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permissions: + +(Assuming that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ +$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 54aa822c9..342042ffb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -81,6 +81,7 @@ static std::string format(const char * fmt, ...)
{ #define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" +#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_USE_GELU "clip.use_gelu" #define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_FF "clip.%s.feed_forward_length" @@ -526,6 +527,7 @@ struct clip_ctx { bool has_vision_encoder = false; bool has_llava_projector = false; bool has_minicpmv_projector = false; + int minicpmv_version = 2; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -641,7 +643,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->has_minicpmv_projector) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + if (ctx->minicpmv_version == 2) { + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + } + else if (ctx->minicpmv_version == 3) { + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); + } ggml_set_name(pos_embed, "pos_embed"); ggml_set_input(pos_embed); } @@ -768,8 +775,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + } + else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); @@ -949,10 +956,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } { // attention - const int hidden_size = 4096; + int hidden_size = 4096; const int d_head = 128; - const int n_head = hidden_size/d_head; - const int num_query = 96; + int n_head = hidden_size/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + hidden_size = 4096; + n_head = hidden_size/d_head; + num_query = 96; + } + else if (ctx->minicpmv_version == 3) { + hidden_size = 3584; + n_head = hidden_size/d_head; + num_query = 64; + } struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); @@ -1149,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); } + idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION); + if (idx != -1) { + new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); + } + // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search GGML_ASSERT(new_clip->has_vision_encoder); @@ -1910,10 +1932,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { - if (clip_is_minicpmv(ctx)) { - std::vector> imgs = uhd_slice_image(img); + + if(clip_is_minicpmv(ctx)){ + int max_slice_nums = 9; + 
std::vector> imgs = uhd_slice_image(img, max_slice_nums); res_imgs->size = 0; - for (size_t i = 0; i < imgs.size(); ++i) { + for (size_t i = 0; i < imgs.size(); ++i){ res_imgs->size += imgs[i].size(); } res_imgs->data = new clip_image_f32[res_imgs->size]; @@ -2146,7 +2170,12 @@ int clip_n_patches(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { n_patches /= 4; } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - n_patches = 96; + if (ctx->minicpmv_version == 2) { + n_patches = 96; + } + else if (ctx->minicpmv_version == 3) { + n_patches = 64; + } } return n_patches; @@ -2282,6 +2311,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); + if(ctx->load_image_size==nullptr){ + ctx->load_image_size= clip_image_size_init(); + } + const int pos_w = ctx->load_image_size->width/patch_size; + const int pos_h = ctx->load_image_size->height/patch_size; { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); @@ -2316,8 +2350,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = std::floor(70.0*i/num_positions); + int bucket_coords_h[70]; + int bucket_coords_w[70]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } } ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); @@ -2328,12 +2372,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // -> https://huggingface.co/Qwen/Qwen-VL/tree/main // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - if(ctx->load_image_size==nullptr){ - ctx->load_image_size= clip_image_size_init(); - } - int pos_w = ctx->load_image_size->width/patch_size; - int pos_h = ctx->load_image_size->height/patch_size; int embed_dim = 4096; + if (ctx->minicpmv_version == 2) { + embed_dim = 4096; + } + else if (ctx->minicpmv_version == 3) { + embed_dim = 3584; + } auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); @@ -2346,7 +2391,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); free(pos_embed_data); } - } else { + } + else{ { if (ctx->has_class_embedding) { struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); @@ -2548,13 +2594,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_3_b->ne[0]; } if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { 
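// note: the resampler projects image features to the hidden size of the base LLM, which is why the output dimension branches on the model version below (MiniCPM-V 2.5 builds on Llama3-8B, hidden size 4096; MiniCPM-V 2.6 builds on Qwen2-7B, hidden size 3584)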
- return 4096; + if (ctx->minicpmv_version == 2) { + return 4096; + } + else if (ctx->minicpmv_version == 3) { + return 3584; + } } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } -bool clip_is_minicpmv(const struct clip_ctx * ctx) { - return ctx->has_minicpmv_projector; +int clip_is_minicpmv(const struct clip_ctx * ctx) { + if (ctx->has_minicpmv_projector) { + return ctx->minicpmv_version; + } + return 0; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 2ff4d3992..78588bdf1 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -85,7 +85,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); -CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); +CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); #ifdef __cplusplus } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 916d9dc40..851af0f00 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -256,7 +256,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli load_image_size->width = img_res_v.data[i].nx; load_image_size->height = img_res_v.data[i].ny; clip_add_load_image_size(ctx_clip, load_image_size); - const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + bool encoded = false; + int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); + if (has_minicpmv_projector == 2) { + encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + } + else if (has_minicpmv_projector == 3) { + encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); + } if (!encoded) { LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f951b57b2..379fc295f 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -134,7 +134,13 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e std::string system_prompt; int idx = 0; int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); + if (has_minicpmv_projector == 2) { + system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + } + else if (has_minicpmv_projector == 3) { + system_prompt = "<|im_start|>user\n"; + } LOG_TEE("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); @@ -210,10 +216,24 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ std::string user_prompt = prompt; - if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; + int has_minicpmv_projector = 
clip_is_minicpmv(ctx_llava->ctx_clip); + if (!is_first) { + if (has_minicpmv_projector == 2) { + user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; + } + else if (has_minicpmv_projector == 3) { + user_prompt = "<|im_start|>user\n" + prompt; + } + } eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); + if (has_minicpmv_projector == 2) { + eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); + } + else if (has_minicpmv_projector == 3) { + eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); + } + // generate the response LOG_TEE("\n"); diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py index 12cdd1281..ea773742a 100644 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -1,9 +1,416 @@ -import argparse +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Siglip model. """ +# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes + + import os +import math +import warnings + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out + +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import ( + logging, +) +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +class SiglipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a + Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + Example: + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipVisionConfig() + >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + +_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + +SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/siglip-base-patch16-224", + # See all SigLIP models at https://huggingface.co/models?filter=siglip +] + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. 
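+    # (concretely: l = cdf(a'), u = cdf(b') for the standardized bounds, a uniform sample is drawn in [2*l-1, 2*u-1] and mapped back through erfinv; the final mul_/add_ below rescale to the requested mean and std)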
+ # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + if tensor.dtype in [torch.float16, torch.bfloat16]: + # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu + og_dtype = tensor.dtype + tensor = tensor.to(torch.float32) + tensor.erfinv_() + tensor = tensor.to(og_dtype) + else: + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + if tensor.dtype == torch.float16: + # The `clamp_` op is not (yet?) defined in float16+cpu + tensor = tensor.to(torch.float32) + tensor.clamp_(min=a, max=b) + tensor = tensor.to(torch.float16) + else: + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 +): + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + denom = fan_in + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches_per_side = self.image_size // self.patch_size + 
self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip +class SiglipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip +class SiglipEncoderLayer(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.self_attn = ( + SiglipAttention(config) + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SiglipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + + if isinstance(module, SiglipVisionEmbeddings): + width = self.config.hidden_size + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.normal_(module.q_proj.weight) + nn.init.normal_(module.k_proj.weight) + nn.init.normal_(module.v_proj.weight) + nn.init.normal_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.normal_(module.fc1.weight) + nn.init.normal_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SIGLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +SIGLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ Args: + config: SiglipConfig + """ + + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + +class SiglipVisionTransformer(SiglipPreTrainedModel): + config_class = SiglipVisionConfig + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + + def __init__(self, config: SiglipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embedding + +import argparse import json import re -import torch import numpy as np from gguf import * from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig @@ -94,6 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2) # with proper args = ap.parse_args() @@ -135,6 +543,15 @@ if args.use_f32: # model = CLIPModel.from_pretrained(dir_model) # processor = CLIPProcessor.from_pretrained(dir_model) +minicpmv_version = args.minicpmv_version +emb_dim = 4096 +if minicpmv_version == 1: + emb_dim = 2304 +elif minicpmv_version == 2: + emb_dim = 4096 +elif minicpmv_version == 3: + emb_dim = 3584 + default_vision_config = { "hidden_size": 1152, "image_size": 980, @@ -144,8 +561,12 @@ default_vision_config = { "num_hidden_layers": 27, "patch_size": 14, } + vision_config = Idefics2VisionConfig(**default_vision_config) model = Idefics2VisionTransformer(vision_config) +if minicpmv_version == 3: + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) processor = None # if model.attn_pool is not None: @@ -158,6 +579,7 @@ fname_middle = None has_text_encoder = True has_vision_encoder = True has_minicpmv_projector = False + if args.text_only: fname_middle = "text-" has_vision_encoder = False @@ -165,6 +587,7 @@ elif args.minicpmv_projector is not None: fname_middle = "mmproj-" has_text_encoder = False has_minicpmv_projector = True + minicpmv_version = 3 elif args.vision_only: fname_middle = "vision-" has_text_encoder = False @@ -189,6 +612,7 @@ elif has_minicpmv_projector: fout.add_description("image encoder for MiniCPM-V") # add projector type fout.add_string("clip.projector_type", "resampler") + fout.add_int32("clip.minicpmv_version", minicpmv_version) else: fout.add_description("two-tower CLIP model") @@ -274,11 +698,11 @@ def _replace_name_resampler(s, v): if re.match("resampler.pos_embed", s): return { s: v, - re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + re.sub("pos_embed", "pos_embed_k", s): 
torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), } if re.match("resampler.proj", s): return { - re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), } if re.match("resampler.attn.in_proj_.*", s): diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py index 2b6bce7cf..748ff5c57 100644 --- a/examples/llava/minicpmv-surgery.py +++ b/examples/llava/minicpmv-surgery.py @@ -4,7 +4,7 @@ import torch from transformers import AutoModel, AutoTokenizer ap = argparse.ArgumentParser() -ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model") +ap.add_argument("-m", "--model", help="Path to MiniCPM-V model") args = ap.parse_args() # find the model part that includes the multimodal projector weights @@ -29,7 +29,6 @@ if len(clip_tensors) > 0: f.write("{}\n") config = model.llm.config -config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5" config.auto_map = { "AutoConfig": "configuration_minicpm.MiniCPMConfig", "AutoModel": "modeling_minicpm.MiniCPMModel", @@ -40,7 +39,6 @@ config.auto_map = { model.llm.save_pretrained(f"{args.model}/model") tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) tok.save_pretrained(f"{args.model}/model") -# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py") print("Done!") print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") From 8b3befc0e2ed8fb18b903735831496b8b0c80949 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 16 Aug 2024 17:19:05 +0200 Subject: [PATCH 151/154] server : refactor middleware and /health endpoint (#9056) * server : refactor middleware and /health endpoint * move "fail_on_no_slot" to /slots * Update examples/server/server.cpp Co-authored-by: Georgi Gerganov * fix server tests * fix CI * update server docs --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 35 ++- examples/server/server.cpp | 281 ++++++++---------- examples/server/tests/features/steps/steps.py | 78 ++--- 3 files changed, 177 insertions(+), 217 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index e17595fe8..930ae15f6 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -368,15 +368,16 @@ node index.js ## API Endpoints -### GET `/health`: Returns the current state of the server +### GET `/health`: Returns health check result - - 503 -> `{"status": "loading model"}` if the model is still being loaded. - - 500 -> `{"status": "error"}` if the model failed to load. - - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. - - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available. - - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available. +**Response format** - If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. +- HTTP status code 503 + - Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}` + - Explanation: the model is still being loaded.
+- HTTP status code 200 + - Body: `{"status": "ok" }` + - Explanation: the model is successfully loaded and the server is ready. ### POST `/completion`: Given a `prompt`, it returns the predicted completion. @@ -639,10 +640,16 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte }' ``` -### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. +### GET `/slots`: Returns the current slots processing state + +This endpoint can be disabled with `--no-slots` + +If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots. **Response format** +Example: + ```json [ { @@ -702,7 +709,13 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte ] ``` -### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled: +Possible values for `slot[i].state` are: +- `0`: SLOT_STATE_IDLE +- `1`: SLOT_STATE_PROCESSING + +### GET `/metrics`: Prometheus compatible metrics exporter + +This endpoint is only accessible if `--metrics` is set. Available metrics: - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. @@ -767,6 +780,10 @@ Available metrics: ### GET `/lora-adapters`: Get list of all LoRA adapters +This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` + +By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply` + If an adapter is disabled, the scale will be set to 0. **Response format** diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e073f5813..ce711eadd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -15,6 +15,8 @@ // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +// mime type for sending response +#define MIMETYPE_JSON "application/json; charset=utf-8" // auto generated files (update with ./deps.sh) #include "colorthemes.css.hpp" @@ -67,7 +69,6 @@ enum slot_command { enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_READY, // Server is ready and model is loaded - SERVER_STATE_ERROR // An error occurred, load_model failed }; enum server_task_type { @@ -695,6 +696,7 @@ struct server_context { add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model); + return true; } @@ -2555,19 +2557,19 @@ int main(int argc, char ** argv) { svr->set_default_headers({{"Server", "llama.cpp"}}); // CORS preflight - svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) { + // Access-Control-Allow-Origin is already set by middleware res.set_header("Access-Control-Allow-Credentials", "true"); res.set_header("Access-Control-Allow-Methods", "POST"); res.set_header("Access-Control-Allow-Headers", "*"); - return res.set_content("", "application/json; charset=utf-8"); + return res.set_content("", "text/html"); // blank response, no data }); svr->set_logger(log_server_request); auto res_error = [](httplib::Response & res, json error_data) { json final_response {{"error", error_data}}; - res.set_content(final_response.dump(), 
"application/json; charset=utf-8"); + res.set_content(final_response.dump(), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; @@ -2597,11 +2599,6 @@ int main(int argc, char ** argv) { svr->set_read_timeout (params.timeout_read); svr->set_write_timeout(params.timeout_write); - if (!svr->bind_to_port(params.hostname, params.port)) { - fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port); - return 1; - } - std::unordered_map log_data; log_data["hostname"] = params.hostname; @@ -2617,35 +2614,6 @@ int main(int argc, char ** argv) { // Necessary similarity of prompt for slot selection ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - // load the model - if (!ctx_server.load_model(params)) { - state.store(SERVER_STATE_ERROR); - return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - } - - LOG_INFO("model loaded", {}); - - const auto model_meta = ctx_server.model_meta(); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - // // Middlewares // @@ -2689,8 +2657,6 @@ int main(int argc, char ** argv) { } // API key is invalid or not provided - // TODO: make another middleware for CORS related logic - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); LOG_WARNING("Unauthorized: Invalid API Key", {}); @@ -2698,8 +2664,21 @@ int main(int argc, char ** argv) { return false; }; + auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) { + server_state current_state = state.load(); + if (current_state == SERVER_STATE_LOADING_MODEL) { + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + return false; + } + return true; + }; + // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) { + svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + if (!middleware_server_state(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } if (!middleware_validate_api_key(req, res)) { return httplib::Server::HandlerResponse::Handled; } @@ -2710,62 +2689,15 @@ int main(int argc, char ** argv) { // Route handlers (or controllers) // - const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - switch (current_state) { - case SERVER_STATE_READY: - { - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.type = SERVER_TASK_TYPE_METRICS; - task.id_target = -1; - - ctx_server.queue_results.add_waiting_task_id(task.id); - 
ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - const int n_idle_slots = result.data.at("idle"); - const int n_processing_slots = result.data.at("processing"); - - json health = { - {"status", "ok"}, - {"slots_idle", n_idle_slots}, - {"slots_processing", n_processing_slots} - }; - - res.status = 200; // HTTP OK - if (params.endpoint_slots && req.has_param("include_slots")) { - health["slots"] = result.data.at("slots"); - } - - if (n_idle_slots == 0) { - health["status"] = "no slot available"; - if (req.has_param("fail_on_no_slot")) { - res.status = 503; // HTTP Service Unavailable - } - } - - res.set_content(health.dump(), "application/json"); - break; - } - case SERVER_STATE_LOADING_MODEL: - { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - } break; - case SERVER_STATE_ERROR: - { - res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER)); - } break; - } + const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { + // error and loading states are handled by middleware + json health = {{"status", "ok"}}; + res.set_content(health.dump(), "application/json"); }; - const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { + const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2783,13 +2715,22 @@ int main(int argc, char ** argv) { server_task_result result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - res.set_content(result.data.at("slots").dump(), "application/json"); + // optionally return "fail_on_no_slot" error + const int n_idle_slots = result.data.at("idle"); + if (req.has_param("fail_on_no_slot")) { + if (n_idle_slots == 0) { + res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); + return; + } + } + + res.set_content(result.data.at("slots").dump(), MIMETYPE_JSON); res.status = 200; // HTTP OK }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2914,7 +2855,7 @@ int main(int argc, char ** argv) { if (result.error) { res_error(res, result.data); } else { - res.set_content(result.data.dump(), "application/json"); + res.set_content(result.data.dump(), MIMETYPE_JSON); } }; @@ -2944,7 +2885,7 @@ int main(int argc, char ** argv) { if (result.error) { res_error(res, result.data); } else { - res.set_content(result.data.dump(), "application/json"); + res.set_content(result.data.dump(), MIMETYPE_JSON); } }; @@ -2964,13 +2905,11 @@ int main(int argc, char ** argv) { if (result.error) { res_error(res, result.data); } else { - res.set_content(result.data.dump(), "application/json"); + res.set_content(result.data.dump(), MIMETYPE_JSON); } }; const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - std::string id_slot_str = req.path_params.at("id_slot"); int id_slot; @@ -2994,7 +2933,7 @@ int main(int argc, char ** argv) { } }; - const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_props = [&ctx_server](const httplib::Request &, httplib::Response & res) { std::string template_key = "tokenizer.chat_template", curr_tmpl; int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); if (tlen > 0) { @@ -3003,7 +2942,6 @@ int main(int argc, char ** argv) { curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); } } - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "system_prompt", ctx_server.system_prompt.c_str() }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, @@ -3011,7 +2949,7 @@ int main(int argc, char ** argv) { { "chat_template", curr_tmpl.c_str() } }; - res.set_content(data.dump(), "application/json; charset=utf-8"); + res.set_content(data.dump(), MIMETYPE_JSON); }; const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { @@ -3020,8 +2958,6 @@ int main(int argc, char ** argv) { return; } - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = json::parse(req.body); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3032,7 +2968,7 @@ int main(int argc, char ** argv) { if (!json_value(data, "stream", false)) { server_task_result result = ctx_server.queue_results.recv(id_task); if (!result.error && result.stop) { - res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); } else { res_error(res, result.data); } @@ -3095,9 +3031,7 @@ int main(int argc, char ** argv) { } }; - const auto handle_models = [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - + const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { json models = { {"object", "list"}, {"data", { @@ -3106,12 +3040,12 @@ int main(int argc, char ** argv) { {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, - {"meta", model_meta} + {"meta", ctx_server.model_meta()} }, }} }; - res.set_content(models.dump(), "application/json; 
charset=utf-8"); + res.set_content(models.dump(), MIMETYPE_JSON); }; const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { @@ -3119,8 +3053,6 @@ int main(int argc, char ** argv) { res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } - - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3135,7 +3067,7 @@ int main(int argc, char ** argv) { if (!result.error && result.stop) { json result_oai = format_final_response_oaicompat(data, result.data, completion_id); - res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); } else { res_error(res, result.data); } @@ -3197,8 +3129,6 @@ int main(int argc, char ** argv) { return; } - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = json::parse(req.body); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3209,7 +3139,7 @@ int main(int argc, char ** argv) { if (!json_value(data, "stream", false)) { server_task_result result = ctx_server.queue_results.recv(id_task); if (!result.error && result.stop) { - res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); } else { res_error(res, result.data); } @@ -3257,7 +3187,6 @@ int main(int argc, char ** argv) { }; const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::vector tokens; @@ -3266,11 +3195,10 @@ int main(int argc, char ** argv) { tokens = ctx_server.tokenize(body.at("content"), add_special); } const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json; charset=utf-8"); + return res.set_content(data.dump(), MIMETYPE_JSON); }; const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::string content; @@ -3280,12 +3208,10 @@ int main(int argc, char ** argv) { } const json data = format_detokenized_response(content); - return res.set_content(data.dump(), "application/json; charset=utf-8"); + return res.set_content(data.dump(), MIMETYPE_JSON); }; const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - const json body = json::parse(req.body); bool is_openai = false; @@ -3331,11 +3257,10 @@ int main(int argc, char ** argv) { json root = is_openai ? 
format_embeddings_response_oaicompat(body, responses) : responses[0]; - return res.set_content(root.dump(), "application/json; charset=utf-8"); + return res.set_content(root.dump(), MIMETYPE_JSON); }; - const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { auto & la = ctx_server.lora_adapters[i]; @@ -3345,13 +3270,11 @@ int main(int argc, char ** argv) { {"scale", la.scale}, }); } - res.set_content(result.dump(), "application/json"); + res.set_content(result.dump(), MIMETYPE_JSON); res.status = 200; // HTTP OK }; const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - const std::vector body = json::parse(req.body); int max_idx = ctx_server.lora_adapters.size(); @@ -3379,7 +3302,7 @@ int main(int argc, char ** argv) { server_task_result result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - res.set_content(result.data.dump(), "application/json"); + res.set_content(result.data.dump(), MIMETYPE_JSON); res.status = 200; // HTTP OK }; @@ -3455,35 +3378,75 @@ int main(int argc, char ** argv) { log_data["n_threads_http"] = std::to_string(params.n_threads_http); svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; - LOG_INFO("HTTP server listening", log_data); + // clean up function, to be called before exit + auto clean_up = [&svr]() { + svr->stop(); + llama_backend_free(); + }; - // run the HTTP server in a thread - see comment below - std::thread t([&]() { - if (!svr->listen_after_bind()) { - state.store(SERVER_STATE_ERROR); - return 1; + // bind HTTP listen port, run the HTTP server in a thread + if (!svr->bind_to_port(params.hostname, params.port)) { + LOG_ERROR("couldn't bind HTTP server socket", { + {"hostname", params.hostname}, + {"port", params.port}, + }); + clean_up(); + LOG_ERROR("exiting due to HTTP server error", {}); + return 1; + } + std::thread t([&]() { svr->listen_after_bind(); }); + svr->wait_until_ready(); + + LOG_INFO("HTTP server is listening", log_data); + + // load the model + LOG_INFO("loading model", log_data); + if (!ctx_server.load_model(params)) { + clean_up(); + t.join(); + LOG_ERROR("exiting due to model loading error", {}); + return 1; + } else { + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INFO("model loaded", {}); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses", {}); + params.chat_template = "chatml"; + } } - return 0; - }); + // print sample chat example to make it clear which template is used + { + LOG_INFO("chat template", { + {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, + {"built_in", params.chat_template.empty()}, + }); + } - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_finish_multitask(std::bind( - &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - ctx_server.queue_results.on_multitask_update(std::bind( - &server_queue::update_multitask, - &ctx_server.queue_tasks, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3 - )); + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_finish_multitask(std::bind( + &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + ctx_server.queue_results.on_multitask_update(std::bind( + &server_queue::update_multitask, + &ctx_server.queue_tasks, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3 + )); - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; + shutdown_handler = [&](int) { + ctx_server.queue_tasks.terminate(); + }; + ctx_server.queue_tasks.start_loop(); + } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -3499,12 +3462,8 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - ctx_server.queue_tasks.start_loop(); - - svr->stop(); + clean_up(); t.join(); - llama_backend_free(); - return 0; } diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 6705a34fc..1ba7b60b6 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -205,27 +205,20 @@ def step_start_server(context): async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str): match expecting_status: case 'healthy': - await wait_for_health_status(context, context.base_url, 200, 'ok', - timeout=30) + await wait_for_slots_status(context, context.base_url, 200, + timeout=30) case 'ready' | 'idle': - await wait_for_health_status(context, context.base_url, 200, 'ok', - timeout=30, - params={'fail_on_no_slot': 0, 'include_slots': 0}, - slots_idle=context.n_slots, - slots_processing=0, - expected_slots=[{'id': slot_id, 'state': 0} - for slot_id in - range(context.n_slots if context.n_slots else 1)]) + await wait_for_slots_status(context, context.base_url, 200, + timeout=30, + params={'fail_on_no_slot': 1}, + slots_idle=context.n_slots, + slots_processing=0) case 'busy': - await wait_for_health_status(context, context.base_url, 503, - 'no slot available', - params={'fail_on_no_slot': 0, 'include_slots': 0}, - slots_idle=0, - slots_processing=context.n_slots, - expected_slots=[{'id': slot_id, 'state': 1} - for slot_id in - range(context.n_slots if context.n_slots else 1)]) + await wait_for_slots_status(context, context.base_url, 503, + 
params={'fail_on_no_slot': 1}, + slots_idle=0, + slots_processing=context.n_slots) case _: assert False, "unknown status" @@ -1187,17 +1180,15 @@ async def gather_tasks_results(context): return n_completions -async def wait_for_health_status(context, - base_url, - expected_http_status_code, - expected_health_status, - timeout=3, - params=None, - slots_idle=None, - slots_processing=None, - expected_slots=None): +async def wait_for_slots_status(context, + base_url, + expected_http_status_code, + timeout=3, + params=None, + slots_idle=None, + slots_processing=None): if context.debug: - print(f"Starting checking for health for expected_health_status={expected_health_status}") + print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}") interval = 0.5 counter = 0 if 'GITHUB_ACTIONS' in os.environ: @@ -1205,26 +1196,19 @@ async def wait_for_health_status(context, async with aiohttp.ClientSession() as session: while True: - async with await session.get(f'{base_url}/health', params=params) as health_response: - status_code = health_response.status - health = await health_response.json() + async with await session.get(f'{base_url}/slots', params=params) as slots_response: + status_code = slots_response.status + slots = await slots_response.json() if context.debug: - print(f"HEALTH - response for expected health status='{expected_health_status}' on " - f"'{base_url}/health'?{params} is {health}\n") - if (status_code == expected_http_status_code - and health['status'] == expected_health_status - and (slots_idle is None or health['slots_idle'] == slots_idle) - and (slots_processing is None or health['slots_processing'] == slots_processing)): - if expected_slots is not None: - assert_slots_status(health['slots'], expected_slots) - return - if (status_code == expected_http_status_code - and health['status'] == expected_health_status - and (slots_idle is None or health['slots_idle'] == slots_idle) - and (slots_processing is None or health['slots_processing'] == slots_processing)): - if expected_slots is not None: - assert_slots_status(health['slots'], expected_slots) + print(f"slots responses {slots}\n") + if status_code == 503 and status_code == expected_http_status_code: return + if status_code == 200 and status_code == expected_http_status_code: + n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots) + n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots) + if ((slots_idle is None or slots_idle == n_slots_idle) + and (slots_processing is None or slots_processing == n_slots_processing)): + return await asyncio.sleep(interval) counter += interval @@ -1238,7 +1222,7 @@ async def wait_for_health_status(context, if n_completions > 0: return - assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}' + assert False, f'slots check timeout exceeded {counter}s>={timeout}' def assert_embeddings(embeddings): From 2fb9267887d24a431892ce4dccc75c7095b0d54d Mon Sep 17 00:00:00 2001 From: Yoshi Suhara Date: Sat, 17 Aug 2024 06:34:21 -0700 Subject: [PATCH 152/154] Fix incorrect use of ctx_split for bias tensors (#9063) --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7e9149eb9..5ab65ea97 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6172,9 +6172,9 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // optional MLP bias - layer.ffn_gate_b = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } else { layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); @@ -6498,7 +6498,7 @@ static bool llm_load_tensors( layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens - layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); From 2339a0be1c8e31fcf4531427183b94f2ef019e56 Mon Sep 17 00:00:00 2001 From: ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> Date: Sun, 18 Aug 2024 10:58:04 +0100 Subject: [PATCH 153/154] tests : add integration test for lora adapters (#8957) * Add printing to check weights match torch version * minor code style changes --------- Co-authored-by: Xuan Son Nguyen --- .gitignore | 3 + tests/test-lora-conversion-inference.sh | 139 ++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100755 tests/test-lora-conversion-inference.sh diff --git a/.gitignore b/.gitignore index 5ae030200..9986ac6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,6 @@ poetry.toml # Scripts !/scripts/install-oneapi.bat + +# Test models for lora adapters +/lora-tests diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh new file mode 100755 index 000000000..c05c8e187 --- /dev/null +++ b/tests/test-lora-conversion-inference.sh @@ -0,0 +1,139 @@ +#!/bin/bash +set -e + +# Array of models to iterate over +declare -a params=( + "Gemma2ForCausalLM 64" + "LlamaForCausalLM 64" + "Phi3ForCausalLM 64" +) + +MODELS_REPO=lora-tests +MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO + +# Clone the Hugging Face repository if the directory does not exist +if [ ! -d "$MODELS_REPO" ]; then + echo "Cloning the Hugging Face repository..." + git clone $MODELS_REPO_URL +else + echo "Repository already exists. Skipping clone." 
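+    # note: an existing checkout is reused as-is; delete the lora-tests
+    # directory (or pull inside it) to pick up refreshed reference data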
+fi + +# Array to store results to print +results=() + +trim_leading_whitespace() { + local input_string="$1" + echo "${input_string#"${input_string%%[![:space:]]*}"}" +} + +extract_starting_substring() { + local reference_string="$1" + local target_string="$2" + + local target_length=${#target_string} + echo "${reference_string:0:$target_length}" +} + +get_first_word() { + local input_string="$1" + read -r first_word _ <<< "$input_string" + echo "$first_word" +} + +# Load the expected strings +EXPECTED_BASE_FULL=$(cat $MODELS_REPO/data/pale_blue_dot.txt) +EXPECTED_LORA_FULL=$(cat $MODELS_REPO/data/bohemian_rhapsody.txt) +EXPECTED_BASE_FIRST_WORD=$(get_first_word "$EXPECTED_BASE_FULL") +EXPECTED_LORA_FIRST_WORD=$(get_first_word "$EXPECTED_LORA_FULL") + +run_conversion_and_inference_lora() { + local model_name=$1 + local hidden_size=$2 + + echo -e "\n\n-------- RUNNING TEST FOR MODEL $model_name --------\n\n" + + # Convert safetensors to gguf + echo "Running convert_hf_to_gguf.py for $model_name with hidden_size $hidden_size..." + python convert_hf_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \ + --outfile $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + --outtype f32 + + echo -e "\n\n---------------------------\n\n" + echo "Running convert_lora_to_gguf.py for $model_name with hidden_size $hidden_size..." + python3 convert_lora_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora \ + --base $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \ + --outtype f32 + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-export-lora with lora for $model_name with hidden_size $hidden_size..." + ./llama-export-lora \ + -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + -o $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ + --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf + + # Run inference + echo -e "\n\n---------------------------\n\n" + echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..." + OUTPUT_BASE=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0) + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..." + OUTPUT_LORA_HOT=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \ + -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..." 
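+    # the merged model is expected to reproduce the hot-lora output:
+    # llama-export-lora folds the low-rank update into the base weights
+    # (roughly W' = W + scale * (B x A) per adapted tensor), so with greedy
+    # sampling (--temp 0) and a fixed seed both paths should produce the
+    # same continuation, which the assertions below rely on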
+ OUTPUT_LORA_MERGED=$(./llama-cli -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ + -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) + + # Remove any initial white space + OUTPUT_BASE=$(trim_leading_whitespace "$OUTPUT_BASE") + OUTPUT_LORA_HOT=$(trim_leading_whitespace "$OUTPUT_LORA_HOT") + OUTPUT_LORA_MERGED=$(trim_leading_whitespace "$OUTPUT_LORA_MERGED") + # Extract the corresponding substring from full string + EXPECTED_BASE=$(extract_starting_substring "$EXPECTED_BASE_FULL" "$OUTPUT_BASE") + EXPECTED_LORA=$(extract_starting_substring "$EXPECTED_LORA_FULL" "$OUTPUT_LORA_HOT") + + # Assert output equals the expected output + if [[ "$OUTPUT_BASE" != "$EXPECTED_BASE" ]]; then + echo "Error: $model_name OUTPUT_BASE does not start with the expected string." + echo -e "Out=$OUTPUT_BASE\n\nExp=$EXPECTED_BASE" + exit 1 + fi + if [[ "$OUTPUT_LORA_HOT" != "$EXPECTED_LORA" ]]; then + echo "Error: $model_name OUTPUT_LORA_HOT does not start with the expected string." + echo -e "Out=$OUTPUT_LORA_HOT\n\nExp=$EXPECTED_LORA" + exit 1 + fi + if [[ "$OUTPUT_LORA_MERGED" != "$EXPECTED_LORA" ]]; then + echo "Error: $model_name OUTPUT_LORA_MERGED does not start with the expected string." + echo -e "Out=$OUTPUT_LORA_MERGED\n\nExp=$EXPECTED_LORA" + exit 1 + fi + + # Store the results + results+=(" + \n\033[1mResults for $model_name with hidden_size $hidden_size:\033[0m + \n\033[32m • Base:\n$OUTPUT_BASE + \n\033[34m • Lora hot:\n$OUTPUT_LORA_HOT + \n\033[36m • Lora merged:\n$OUTPUT_LORA_MERGED + \n \033[0m + ") + + echo "All tests passed for $model_name with hidden_size $hidden_size!" +} + +# Run test for each model +for param in "${params[@]}"; do + run_conversion_and_inference_lora $param +done + +# Print results +echo -e "\n\n---------------------------\n\n" +echo -e "\n\033[1mSummary of All Results:\033[0m" +for result in "${results[@]}"; do + echo -e "$result" +done From 554b049068de24201d19dde2fa83e35389d4585d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 18 Aug 2024 17:43:32 +0300 Subject: [PATCH 154/154] flake.lock: Update (#9068) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index f9e1548a2..8e6c3e467 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1723175592, - "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=", + "lastModified": 1723637854, + "narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b", + "rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9", "type": "github" }, "original": {
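A flake.lock bump like the one above is generated rather than hand-edited; such updates are typically produced on a schedule by CI. Assuming a checkout with a recent Nix installed, a minimal sketch of the equivalent manual refresh (the `--commit-lock-file` flag, which also creates the commit, is optional):

    # re-pin every flake input recorded in flake.lock (nixpkgs, etc.)
    # to its latest upstream revision and rewrite the lock file
    nix flake update --commit-lock-file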