diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 03d76d455..66ad85938 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -669,8 +669,7 @@ jobs: run: | cd examples/llama.android - # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820). - ./gradlew build --no-daemon -Pskip-armeabi-v7a + ./gradlew build --no-daemon # freeBSD-latest: # runs-on: macos-12 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c4629001..48880f720 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a # Raspberry Pi 3, 4, Zero 2 (32-bit) list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() diff --git a/Makefile b/Makefile index f03faf6ed..068f6ed02 100644 --- a/Makefile +++ b/Makefile @@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef LLAMA_CUBLAS $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) -CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH diff --git a/README.md b/README.md index 3bc512af0..d0af5d0b9 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well. - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) +**HTTP server** + +[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. **Bindings:** @@ -155,6 +158,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Msty](https://msty.app) (proprietary) --- diff --git a/common/common.cpp b/common/common.cpp index 10ef11829..ec596f5a0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; @@ -837,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { diff --git a/common/common.h b/common/common.h index 935771d44..3e21579b0 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/common/sampling.cpp b/common/sampling.cpp index de4331a11..e67096bea 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl( // } //} - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); + //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); } } diff --git a/common/train.cpp b/common/train.cpp index e4c3d5df6..0dbfd24df 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -31,7 +31,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; @@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::string opt_type; GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; + opt->params.type = GGML_OPT_TYPE_ADAM; GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); @@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; + opt->params.type = GGML_OPT_TYPE_LBFGS; GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); @@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); @@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_add_tensor(fctx, opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 32d54b45f..ae30b2a76 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -192,7 +192,7 @@ class Model: return RefactModel if model_architecture == "PersimmonForCausalLM": return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return StableLMModel if model_architecture == "QWenLMHeadModel": return QwenModel @@ -253,7 +253,7 @@ class Model: return gguf.MODEL_ARCH.REFACT if arch == "PersimmonForCausalLM": return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return gguf.MODEL_ARCH.STABLELM if arch == "QWenLMHeadModel": return gguf.MODEL_ARCH.QWEN @@ -1074,10 +1074,11 @@ class StableLMModel(Model): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) class MixtralModel(Model): diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 65bb238a0..bf0125e75 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS); opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.lbfgs.n_iter = 16; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 98bf5a07a..3da5317b3 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 92c67b7cf..d4b8729dd 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -447,8 +447,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae..8fec3d43d 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index aadbe22c9..d42140efe 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -21,12 +21,8 @@ android { useSupportLibrary = true } ndk { - // Workaround for https://github.com/llvm/llvm-project/issues/65820 - // affecting armeabi-v7a. Skip armeabi-v7a when invoked with - // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). - if (project.hasProperty("skip-armeabi-v7a")) { - abiFilters += listOf("arm64-v8a", "x86_64", "x86") - } + // Add NDK properties if wanted, e.g. + // abiFilters += listOf("arm64-v8a") } externalNativeBuild { cmake { diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 1a1cf7c78..980128166 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); - if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) { if (newline_tmp->buffer == NULL) { printf("newline_tmp tensor buffer is NULL\n"); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7555dffe4..34e84d0d4 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -548,8 +548,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -576,9 +576,9 @@ int main(int argc, char ** argv) { LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e12a1cdf1..47de67a93 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token @@ -146,10 +146,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_cache_update (ctx); - n_past -= bd; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } llama_batch_clear(batch); @@ -179,10 +180,12 @@ int main(int argc, char ** argv) { LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); + llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; llama_batch_clear(batch); @@ -208,10 +211,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_kv_cache_defrag (ctx); + llama_kv_cache_update (ctx); - n_past -= n_discard; + n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/server/README.md b/examples/server/README.md index 0c43ac4c9..0e9bd7fd4 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,8 +1,20 @@ -# llama.cpp/example/server +# LLaMA.cpp HTTP Server -This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp. +Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**. -Command line options: +Set of LLM REST APIs and a simple web front end to interact with llama.cpp. + +**Features:** + * LLM inference of F16 and quantum models on GPU and CPU + * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes + * Parallel decoding with multi-user support + * Continuous batching + * Multimodal (wip) + * Monitoring endpoints + +The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). + +**Command line options:** - `--threads N`, `-t N`: Set the number of threads to use during generation. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. @@ -39,9 +51,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` -- `-n, --n-predict`: Set the maximum tokens to predict (default: -1) +- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. +- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +- `--log-disable`: Output logs to stdout only, default: enabled. +- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) ## Build @@ -457,6 +472,18 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` +- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: + +Available metrics: +- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. +- `llamacpp:tokens_predicted_total`: Number of generation tokens processed. +- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s. +- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s. +- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage. +- `llamacpp:kv_cache_tokens`: KV-cache tokens. +- `llamacpp:requests_processing`: Number of request processing. +- `llamacpp:requests_deferred`: Number of request deferred. + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1ac3c3844..9d6b3b678 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,9 +43,11 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; bool slots_endpoint = true; + bool metrics_endpoint = false; }; bool server_verbose = false; +bool server_log_json = true; static size_t common_part(const std::vector &a, const std::vector &b) { @@ -301,12 +303,76 @@ struct llama_client_slot } void print_timings() const { - LOG_TEE("\n"); - LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); + } +}; + +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; } }; @@ -344,6 +410,8 @@ struct llama_server_context llama_server_queue queue_tasks; llama_server_response queue_results; + llama_metrics metrics; + ~llama_server_context() { if (ctx) @@ -363,7 +431,7 @@ struct llama_server_context params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); + LOG_INFO("Multi Modal Mode Enabled", {}); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); if(clp_ctx == nullptr) { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); @@ -416,7 +484,7 @@ struct llama_server_context const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_TEE("Available slots:\n"); + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; @@ -425,7 +493,10 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_INFO("new slot", { + {"slot_id", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -435,7 +506,12 @@ struct llama_server_context GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); + + LOG_INFO("slot self-extend", { + {"slot_id", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); } slot.ga_i = 0; @@ -729,10 +805,16 @@ struct llama_server_context img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); + LOG_ERROR("failed to load image", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); return false; } - LOG_TEE("slot %i - loaded image\n", slot->id); + LOG_VERBOSE("image loaded", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -792,7 +874,10 @@ struct llama_server_context all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); return true; } @@ -817,10 +902,24 @@ struct llama_server_context llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } - if (llama_decode(ctx, batch) != 0) + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return; + const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + if (llama_decode(ctx, batch_view) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } } // assign the system KV cache to all parallel sequences @@ -1355,7 +1454,7 @@ struct llama_server_context if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {}); + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); queue_tasks.defer(task); break; } @@ -1404,7 +1503,7 @@ struct llama_server_context case TASK_TYPE_NEXT_RESPONSE: { // do nothing } break; - case TASK_TYPE_SLOTS_DATA: { + case TASK_TYPE_METRICS: { json slots_data = json::array(); int n_idle_slots = 0; int n_processing_slots = 0; @@ -1431,17 +1530,41 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + LOG_INFO("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots} + }); + LOG_VERBOSE("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots}, + {"slots", slots_data} + }); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; res.stop = true; res.error = false; res.result_json = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "slots", slots_data } + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", queue_tasks.queue_tasks_deferred.size() }, + + { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, + { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, + + { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, + { "t_prompt_processing", metrics.t_prompt_processing}, + { "n_tokens_predicted", metrics.n_tokens_predicted}, + { "t_tokens_generation", metrics.t_tokens_generation}, + + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + + { "slots", slots_data }, }; + metrics.reset_bucket(); queue_results.send(res); } break; } @@ -1469,7 +1592,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_TEE("updating system prompt\n"); + LOG_INFO("updating system prompt", {}); update_system_prompt(); } @@ -1479,12 +1602,13 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; } + LOG_VERBOSE("posting NEXT_RESPONSE", {}); task_server task; task.type = TASK_TYPE_NEXT_RESPONSE; task.target_id = -1; @@ -1498,12 +1622,22 @@ struct llama_server_context { // Shift context const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = system_tokens.size() + slot.n_past - n_keep; + const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + LOG_INFO("slot context shift", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_keep", n_keep}, + {"n_left", n_left}, + {"n_discard", n_discard}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()} + }); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1515,17 +1649,12 @@ struct llama_server_context slot.n_past -= n_discard; slot.truncated = true; - - LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, - { "n_keep", n_keep }, - { "n_left", n_left }, - }); } } } // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); for (auto & slot : slots) { // release the slot @@ -1535,7 +1664,15 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); queue_tasks.notify_slot_changed(); continue; @@ -1662,6 +1799,14 @@ struct llama_server_context } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + + // the last token of the cache is not in the KV cache until the next call to llama_decode + // (it was sampled, pushed into the "cache_tokens", but not yet put in the context) + if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) + { + slot.n_past -= 1; + } + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; if (slot.ga_n != 1) @@ -1683,7 +1828,12 @@ struct llama_server_context slot.ga_i = ga_i; } - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_INFO("slot progression", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + }); } slot.cache_tokens = prompt_tokens; @@ -1691,7 +1841,10 @@ struct llama_server_context if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "slot_id", slot.id }, + { "task_id", slot.task_id } + }); slot.n_past--; if (slot.ga_i > 0) { @@ -1699,9 +1852,13 @@ struct llama_server_context } } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + int p0 = (int) system_tokens.size() + slot.n_past; + LOG_INFO("kv cache rm [p0, end)", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "p0", p0 } + }); + llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, @@ -1736,7 +1893,13 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG_ERROR("failed processing images", { + "slot_id", slot.id, + "task_id", slot.task_id, + }); + // FIXME @phymbert: to be properly tested + // early returning without changing the slot state will block the slot for ever + // no one at the moment is checking the return value return false; } @@ -1778,9 +1941,9 @@ struct llama_server_context LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; @@ -1836,7 +1999,7 @@ struct llama_server_context send_embedding(slot); slot.release(); slot.i_batch = -1; - return true; + continue; } completion_token_output result; @@ -1849,6 +2012,7 @@ struct llama_server_context { slot.t_start_genereration = ggml_time_us(); slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); } llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; @@ -1871,11 +2035,14 @@ struct llama_server_context slot.release(); slot.print_timings(); send_final_response(slot); + metrics.on_prediction(slot); } slot.i_batch = -1; } } + + LOG_VERBOSE("slots updated", {}); return true; } @@ -1953,8 +2120,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -ctv TYPE, --cache-type-v TYPE\n"); printf(" KV cache data type for V (default: f16)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf(" --log-format log output format: json or text (default: json)\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); + printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); printf("\n"); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); @@ -2086,9 +2255,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2212,15 +2381,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; @@ -2405,6 +2574,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.mmproj = argv[i]; } + else if (arg == "--log-format") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + if (std::strcmp(argv[i], "json") == 0) + { + server_log_json = true; + } + else if (std::strcmp(argv[i], "text") == 0) + { + server_log_json = false; + } + else + { + invalid_param = true; + break; + } + } else if (arg == "--log-disable") { log_set_target(stdout); @@ -2414,6 +2604,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { sparams.slots_endpoint = false; } + else if (arg == "--metrics") + { + sparams.metrics_endpoint = true; + } else if (arg == "--chat-template") { if (++i >= argc) @@ -2447,15 +2641,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { @@ -2514,32 +2708,40 @@ static json format_partial_response( static json format_tokenizer_response(const std::vector &tokens) { - return json{ - {"tokens", tokens}}; + return json { + {"tokens", tokens} + }; } static json format_detokenized_response(std::string content) { - return json{ - {"content", content}}; + return json { + {"content", content} + }; } static void log_server_request(const httplib::Request &req, const httplib::Response &res) { + // skip GH copilot requests when using default port + if (req.path == "/v1/health" || req.path == "/v1/completions") + { + return; + } + LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + {"remote_addr", req.remote_addr}, + {"remote_port", req.remote_port}, + {"status", res.status}, + {"method", req.method}, + {"path", req.path}, + {"params", req.params}, + }); LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + {"request", req.body}, + {"response", res.body}, + }); } struct token_translator @@ -2621,7 +2823,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2668,7 +2870,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2683,6 +2885,87 @@ int main(int argc, char **argv) }); } + if (sparams.metrics_endpoint) { + svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) { + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_METRICS; + task.target_id = -1; + + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + json data = result.result_json; + + uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; + uint64_t t_prompt_processing = data["t_prompt_processing"]; + + uint64_t n_tokens_predicted = data["n_tokens_predicted"]; + uint64_t t_tokens_generation = data["t_tokens_generation"]; + + int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", data["n_prompt_tokens_processed_total"]} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", data["n_tokens_predicted_total"]} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0} + },{ + {"name", "kv_cache_usage_ratio"}, + {"help", "KV-cache usage. 1 means 100 percent usage."}, + {"value", 1. * kv_cache_used_cells / params.n_ctx} + },{ + {"name", "kv_cache_tokens"}, + {"help", "KV-cache tokens."}, + {"value", data["kv_cache_tokens_count"]} + },{ + {"name", "requests_processing"}, + {"help", "Number of request processing."}, + {"value", data["processing"]} + },{ + {"name", "requests_deferred"}, + {"help", "Number of request deferred."}, + {"value", data["deferred"]} + }}} + }; + + std::stringstream prometheus; + for (const auto& el : all_metrics_def.items()) { + const auto& type = el.key(); + const auto& metrics_def = el.value(); + for (const auto& metric_def : metrics_def) { + std::string name = metric_def["name"]; + std::string help = metric_def["help"]; + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << metric_def["value"] << "\n"; + } + } + + res.set_content(prometheus.str(), "text/plain; version=0.0.4"); + res.status = 200; // HTTP OK + }); + } + svr.set_logger(log_server_request); svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) @@ -2735,9 +3018,6 @@ int main(int argc, char **argv) // Set the base directory for serving static files svr.set_base_dir(sparams.public_path); - // to make it ctrl+clickable: - LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - std::unordered_map log_data; log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index e44c5c286..0b9fdc4e7 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables: - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` + - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 13cc84101..09e826747 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -16,6 +16,8 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): + if context.server_process is None: + return if scenario.status == "failed": if 'GITHUB_ACTIONS' in os.environ: print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature index 542006d9a..bf5a175a3 100644 --- a/examples/server/tests/features/issues.feature +++ b/examples/server/tests/features/issues.feature @@ -1,36 +1,4 @@ # List of ongoing issues @bug Feature: Issues - # Issue #5655 - Scenario: Multi users embeddings - Given a server listening on localhost:8080 - And a model file stories260K.gguf - And a model alias tinyllama-2 - And 42 as server seed - And 64 KV cache size - And 2 slots - And continuous batching - And embeddings extraction - Then the server is starting - Then the server is healthy - - Given a prompt: - """ - Write a very long story about AI. - """ - And a prompt: - """ - Write another very long music lyrics. - """ - And a prompt: - """ - Write a very long poem. - """ - And a prompt: - """ - Write a very long joke. - """ - Given concurrent embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated + # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 029a909ea..5f895cf90 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -8,6 +8,7 @@ Feature: Parallel And 42 as server seed And 64 KV cache size And 2 slots + And embeddings extraction And continuous batching Then the server is starting Then the server is healthy @@ -97,3 +98,48 @@ Feature: Parallel Then the server is busy Then the server is idle Then all prompts are predicted + + Scenario: Multi users embeddings + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated + + Scenario: Multi users OAI compatibility embeddings + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + And a prompt: + """ + What is the biggest US city ? + """ + And a prompt: + """ + What is the capital of Bulgaria ? + """ + And a model tinyllama-2 + Given concurrent OAI embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index fedcfe5ae..b571582a7 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -13,6 +13,7 @@ Feature: llama.cpp server And 1 slots And embeddings extraction And 32 server max tokens to predict + And prometheus compatible metrics exposed Then the server is starting Then the server is healthy @@ -25,11 +26,12 @@ Feature: llama.cpp server And max tokens to predict And a completion request with no api error Then tokens are predicted matching + And prometheus metrics are exposed Examples: Prompts - | prompt | n_predict | re_content | n_predicted | - | I believe the meaning of life is | 8 | read | 8 | - | Write a joke about AI | 64 | (parkfriendsscared)+ | 32 | + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | (readgoing)+ | 8 | + | Write a joke about AI | 64 | (parkfriendsscaredalways)+ | 32 | Scenario Outline: OAI Compatibility Given a model @@ -60,6 +62,19 @@ Feature: llama.cpp server """ Then embeddings are generated + Scenario: OAI Embeddings compatibility with multiple inputs + Given a model tinyllama-2 + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + When an OAI compatible embeddings computation request for multiple inputs + Then embeddings are generated + Scenario: Tokenize / Detokenize When tokenizing: diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index ea733a394..91dfccd28 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,4 +1,5 @@ import asyncio +import collections import json import os import re @@ -12,6 +13,7 @@ import aiohttp import openai from behave import step from behave.api.async_step import async_run_until_complete +from prometheus_client import parser @step(u"a server listening on {server_fqdn}:{server_port}") @@ -33,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_api_key = None context.server_continuous_batching = False context.server_embeddings = False + context.server_metrics = False + context.server_process = None context.server_seed = None context.user_api_key = None @@ -81,6 +85,11 @@ def step_server_embeddings(context): context.server_embeddings = True +@step(u'prometheus compatible metrics exposed') +def step_server_metrics(context): + context.server_metrics = True + + @step(u"the server is starting") def step_start_server(context): start_server_background(context) @@ -262,57 +271,56 @@ def step_a_prompt_prompt(context, prompt): @step(u'concurrent completion requests') @async_run_until_complete() async def step_concurrent_completion_requests(context): - await concurrent_completion_requests(context, - request_completion, - # prompt is inserted automatically - context.base_url, - debug=context.debug, - n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - server_seed=context.server_seed if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key if hasattr(context, - 'user_api_key') else None) + await concurrent_requests(context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key if hasattr(context, + 'user_api_key') else None) @step(u'concurrent OAI completions requests') @async_run_until_complete async def step_oai_chat_completions(context): - await concurrent_completion_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - '/v1/chat/completions', - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) @step(u'concurrent OAI completions requests no v1') @async_run_until_complete async def step_oai_chat_completions(context): - await concurrent_completion_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - '/chat/completions', - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + '/chat/completions', + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) @step(u'all prompts are predicted') @@ -339,36 +347,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None): @step(u'embeddings are computed for') @async_run_until_complete async def step_compute_embedding(context): - content = context.text - base_url = context.base_url - context.embeddings = await request_embedding(content, base_url) + context.embeddings = await request_embedding(context.text, base_url=context.base_url) @step(u'embeddings are generated') def step_assert_embeddings(context): - assert_embeddings(context.embeddings) + if len(context.prompts) == 0: + assert_embeddings(context.embeddings) + else: + assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n" + f"context.prompts={context.prompts}\n" + f"context.embeddings={context.embeddings}") + for embedding in context.embeddings: + context.prompts.pop() + assert_embeddings(embedding) @step(u'an OAI compatible embeddings computation request for') -def step_oai_compute_embedding(context): - openai.api_key = 'nope' # openai client always expects an api_keu - if context.user_api_key is not None: - openai.api_key = context.user_api_key - openai.api_base = f'{context.base_url}/v1' - embeddings = openai.Embedding.create( - model=context.model, - input=context.text, - ) - context.embeddings = embeddings +@async_run_until_complete +async def step_oai_compute_embeddings(context): + context.embeddings = await request_oai_embeddings(context.text, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) + + +@step(u'an OAI compatible embeddings computation request for multiple inputs') +@async_run_until_complete +async def step_oai_compute_embeddings_multiple_inputs(context): + context.embeddings = await request_oai_embeddings(context.prompts, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) @step(u'concurrent embedding requests') @async_run_until_complete() async def step_concurrent_embedding_requests(context): - await concurrent_completion_requests(context, - request_embedding, - # prompt is inserted automatically - context.base_url) + await concurrent_requests(context, + request_embedding, + # prompt is inserted automatically + base_url=context.base_url) + + +@step(u'concurrent OAI embedding requests') +@async_run_until_complete() +async def step_concurrent_oai_embedding_requests(context): + await concurrent_requests(context, + request_oai_embeddings, + # prompt is inserted automatically + base_url=context.base_url, + async_client=True, + model=context.model) @step(u'all embeddings are generated') @@ -424,7 +454,24 @@ def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value -async def concurrent_completion_requests(context, f_completion, *args, **kwargs): +@step(u'prometheus metrics are exposed') +@async_run_until_complete +async def step_prometheus_metrics_exported(context): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/metrics') as metrics_response: + assert metrics_response.status == 200 + assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" + metrics_raw = await metrics_response.text() + metric_exported = False + for metric in parser.text_string_to_metric_families(metrics_raw): + match metric.name: + case "llamacpp:kv_cache_usage_ratio": + assert len(metric.samples) > 0 + metric_exported = True + assert metric_exported, "No metrics exported" + + +async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: print(f"starting {n_prompts} concurrent completion requests...") @@ -589,7 +636,7 @@ async def oai_chat_completions(user_prompt, return completion_response -async def request_embedding(content, base_url): +async def request_embedding(content, base_url=None): async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/embedding', json={ @@ -600,6 +647,46 @@ async def request_embedding(content, base_url): return response_json['embedding'] +async def request_oai_embeddings(input, + base_url=None, user_api_key=None, + model=None, async_client=False): + # openai client always expects an api_key + user_api_key = user_api_key if user_api_key is not None else 'nope' + if async_client: + origin = 'llama.cpp' + if user_api_key is not None: + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/embeddings', + json={ + "input": input, + "model": model, + }, + headers=headers) as response: + assert response.status == 200, f"received status code not expected: {response.status}" + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + response_json = await response.json() + assert response_json['model'] == model, f"invalid model received: {response_json['model']}" + assert response_json['object'] == 'list' + return response_json['data'] + else: + openai.api_key = user_api_key + openai.api_base = f'{base_url}/v1' + oai_embeddings = openai.Embedding.create( + model=model, + input=input, + ) + + if isinstance(input, collections.abc.Sequence): + embeddings = [] + for an_oai_embeddings in oai_embeddings.data: + embeddings.append(an_oai_embeddings.embedding) + else: + embeddings = oai_embeddings.data.embedding + return embeddings + + def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): content = completion_response['content'] n_predicted = completion_response['timings']['predicted_n'] @@ -635,6 +722,8 @@ async def wait_for_health_status(context, if context.debug: print(f"Starting checking for health for expected_health_status={expected_health_status}") timeout = 3 # seconds + if expected_health_status == 'ok': + timeout = 10 # CI slow inference interval = 0.5 counter = 0 async with aiohttp.ClientSession() as session: @@ -672,7 +761,7 @@ async def wait_for_health_status(context, if n_completions > 0: return - assert False, 'timeout exceeded' + assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}' def assert_embeddings(embeddings): @@ -714,6 +803,8 @@ def start_server_background(context): server_args.append('--cont-batching') if context.server_embeddings: server_args.append('--embedding') + if context.server_metrics: + server_args.append('--metrics') if context.model_alias is not None: server_args.extend(['--alias', context.model_alias]) if context.n_ctx is not None: @@ -726,6 +817,8 @@ def start_server_background(context): server_args.extend(['--api-key', context.server_api_key]) if context.debug: server_args.append('--verbose') + if 'SERVER_LOG_FORMAT_JSON' not in os.environ: + server_args.extend(['--log-format', "text"]) print(f"starting server with: {context.server_path}", *server_args) context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 3e51b12dc..334fa4a70 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,3 +1,4 @@ aiohttp~=3.9.3 behave~=1.2.6 openai~=0.25.0 +prometheus-client~=0.20.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 88545eb69..d7abd7cbb 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -14,6 +14,7 @@ using json = nlohmann::json; extern bool server_verbose; +extern bool server_log_json; #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -27,14 +28,14 @@ extern bool server_verbose; { \ if (server_verbose) \ { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ } \ } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) // // parallel @@ -50,7 +51,7 @@ enum task_type { TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, TASK_TYPE_NEXT_RESPONSE, - TASK_TYPE_SLOTS_DATA + TASK_TYPE_METRICS }; struct task_server { @@ -133,26 +134,48 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, - const char *message, const nlohmann::ordered_json &extra) +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log - { + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = nlohmann::ordered_json{ + {"tid", ss_tid.str()}, {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, }; - if (!extra.empty()) - { - log.merge_patch(extra); - } + if (server_log_json) { + log.merge_patch( + { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { + log.merge_patch(extra); + } - const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); + std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + } else { + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); + + if (!extra.empty()) { + log.merge_patch(extra); + } + std::stringstream ss; + ss << buf << " |"; + for (const auto& el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); + ss << buf; + } + + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); + } } // @@ -234,6 +257,7 @@ struct llama_server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; + LOG_VERBOSE("new task id", {{"new_id", task.id}}); } queue_tasks.push_back(std::move(task)); condition_tasks.notify_one(); @@ -249,7 +273,9 @@ struct llama_server_queue { // Get the next id for creating anew task int get_new_id() { std::unique_lock lock(mutex_tasks); - return id++; + int new_id = id++; + LOG_VERBOSE("new task id", {{"new_id", new_id}}); + return new_id; } // Register function to process a new task @@ -290,8 +316,7 @@ struct llama_server_queue { void start_loop() { running = true; while (true) { - // new task arrived - LOG_VERBOSE("have new task", {}); + LOG_VERBOSE("new task may arrive", {}); { while (true) { @@ -303,7 +328,7 @@ struct llama_server_queue { task_server task = queue_tasks.front(); queue_tasks.erase(queue_tasks.begin()); lock.unlock(); - LOG_VERBOSE("callback_new_task", {}); + LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); callback_new_task(task); } LOG_VERBOSE("callback_all_task_finished", {}); @@ -384,11 +409,13 @@ struct llama_server_response { std::condition_variable condition_results; void add_waiting_task_id(int task_id) { + LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } void remove_waiting_task_id(int task_id) { + LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); } @@ -401,7 +428,6 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", {}); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -426,22 +452,22 @@ struct llama_server_response { // Send a new result to a waiting task_id void send(task_result result) { std::unique_lock lock(mutex_results); - LOG_VERBOSE("send new result", {}); + LOG_VERBOSE("send new result", {{"task_id", result.id}}); for (auto& task_id : waiting_task_ids) { // LOG_TEE("waiting task id %i \n", task_id); // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result if (result.multitask_id == task_id) { - LOG_VERBOSE("callback_update_multitask", {}); + LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); callback_update_multitask(task_id, result.id, result); continue; } if (result.id == task_id) { - LOG_VERBOSE("queue_results.push_back", {}); + LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); queue_results.push_back(result); - condition_results.notify_one(); + condition_results.notify_all(); return; } } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e78ab185d..7eafe8515 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -960,7 +960,7 @@ int main(int argc, char ** argv) { struct ggml_opt_context * opt = train->opt; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/flake.lock b/flake.lock index 47d6448b5..9f659ba8f 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1708118438, - "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=", + "lastModified": 1708655239, + "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80", + "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a", "type": "github" }, "original": { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 21c612cb7..fb6d4f7d2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6369,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -7927,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co const dim3 block_dims(ncols, 1, 1); const dim3 block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols); - } else if (order == GGML_SORT_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols); + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols); } else { GGML_ASSERT(false); } @@ -8362,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( cudaMemcpyKind kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = cudaMemcpyHostToDevice; src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -8771,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -8920,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -9096,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; const int compute_capability = g_device_caps[id].cc; @@ -9444,7 +9444,7 @@ static void ggml_cuda_op_soft_max( const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_GPU; + const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; if (src2_on_device) { ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; @@ -9502,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -9555,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -9636,8 +9636,8 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -9653,20 +9653,20 @@ static void ggml_cuda_op_mul_mat( ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); std::array tensor_split; if (split) { - // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; tensor_split = buft_ctx->tensor_split; @@ -9724,8 +9724,8 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; @@ -9776,8 +9776,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9802,12 +9802,12 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9820,14 +9820,14 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -9845,10 +9845,10 @@ static void ggml_cuda_op_mul_mat( if (!dst_on_device) { void * dst_off_device; cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device]; kind = cudaMemcpyDeviceToDevice; } else { @@ -9913,7 +9913,7 @@ static void ggml_cuda_op_mul_mat( } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_cuda_set_device(g_main_device); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -10019,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -10050,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -10109,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS @@ -10255,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -10409,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); @@ -10553,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s cudaStream_t stream = g_cudaStreams[g_main_device][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -10570,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -10611,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; for (int32_t row_id = 0; row_id < n_as; ++row_id) { @@ -10668,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaStreamSynchronize(stream)); } } @@ -10685,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -10817,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st if (!g_cublas_loaded) return false; ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -10966,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -11072,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -11087,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11098,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11333,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); } } - tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; tensor->extra = extra; } @@ -11605,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } @@ -11614,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } @@ -11644,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg ggml_cuda_set_main_device(cuda_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11654,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg } #ifndef NDEBUG - assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-metal.m b/ggml-metal.m index ee584cfa7..3d6b01263 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2262,8 +2262,8 @@ static bool ggml_metal_graph_compute( id pipeline = nil; switch (order) { - case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; default: GGML_ASSERT(false); }; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 797bee667..df619a884 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) { } void ggml_cl_free_data(const struct ggml_tensor* tensor) { - if (tensor->backend != GGML_BACKEND_GPU) { + if (tensor->backend != GGML_BACKEND_TYPE_GPU) { return; } @@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o } static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src } static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); } @@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } @@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } - if (src1->backend != GGML_BACKEND_GPU) { + if (src1->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_Y, y_size); } - if (dst->backend != GGML_BACKEND_GPU) { + if (dst->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_D, d_size); } } @@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); @@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_fp16_to_fp32_row(tmp, d, d_ne); @@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } ggml_cl_pool_free(d_Y, y_size); @@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_Q; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { d_Q = ggml_cl_pool_malloc(q_sz, &q_size); } @@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { d_Q = (cl_mem) src0->extra; } else { GGML_ASSERT(false); @@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * if (!mul_mat_vec) { // convert src0 to fp32 on device const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); @@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // compute const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; const cl_int ncols = ne00; events.emplace_back(); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); @@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } ggml_cl_pool_free(d_Y, y_size); ggml_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { ggml_cl_pool_free(d_Q, q_size); } } @@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) { return true; } @@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { CL_CHECK(clFinish(queue)); tensor->extra = dst; - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); } // ggml-backend @@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->sub_buffers.push_back(sub_buffer); tensor->extra = sub_buffer; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; } static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { diff --git a/ggml-quants.c b/ggml-quants.c index 5c5f2ce1b..3d94d166d 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { return res; } +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + #else #define ggml_int16x8x2_t int16x8x2_t @@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 #define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 #endif @@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v qs += 16; vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); @@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index b897828f9..c6c3c6e6f 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){ size_t total_elements = ggml_nelements(src); - const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT; + const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT; float *src_data =NULL; if(src_on_device) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; @@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -10825,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const sycl::range<3> block_dims(1, 1, ncols); const sycl::range<3> block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { + if (order == GGML_SORT_ORDER_ASC) { /* DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10834,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); - } else if (order == GGML_SORT_DESC) { + } else if (order == GGML_SORT_ORDER_DESC) { /* DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10845,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); } else { GGML_ASSERT(false); @@ -11407,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, dpct::memcpy_direction kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = dpct::host_to_device; src_ptr = (char *) src->data; - // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr); - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = dpct::device_to_device; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -11846,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -12119,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check @@ -12501,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -12565,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); } @@ -12640,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -12656,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); @@ -12717,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; @@ -12782,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; const int64_t row_diff = row_high[id] - row_low[id]; ggml_sycl_set_device(get_device_id_by_index(id)); @@ -12809,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) { dst_dd_i += row_low[id]; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device_index) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; @@ -12830,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, src1_ncols * ne10 * sizeof(float)))); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { SYCL_CHECK(ggml_sycl_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does @@ -12867,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, if (!dst_on_device) { void * dst_off_device; dpct::memcpy_direction kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = dpct::device_to_host; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device_index]; kind = dpct::device_to_device; } else { @@ -12954,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); @@ -13091,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -13129,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13196,7 +13196,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13372,11 +13372,11 @@ catch (sycl::exception const &exc) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { @@ -13505,7 +13505,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne); @@ -13643,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); @@ -13661,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -13756,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); } } @@ -13779,8 +13779,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -13887,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { + if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) { continue; } ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { + if (backend == GGML_BACKEND_TYPE_GPU) { row_low = 0; row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { + } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { const int64_t rounding = get_row_rounding(tensor->type); row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; @@ -13946,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { extra->data_device[id] = buf; - if (backend == GGML_BACKEND_GPU_SPLIT) { + if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { for (int64_t is = 0; is < MAX_STREAMS; ++is) { SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event())); @@ -13963,7 +13963,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_free_data(struct ggml_tensor *tensor) try { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) { return; } @@ -14016,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, return; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) { const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); } } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) { ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } @@ -14042,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, SYCL_CHECK(ggml_sycl_set_device(g_main_device)); const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t offset = 0; @@ -14111,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor, const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t view_offset = 0; @@ -14132,7 +14132,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -14219,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ if (!g_sycl_loaded) return false; ggml_sycl_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -14359,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_sycl_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -14517,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -14548,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14573,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14809,7 +14809,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( (char *)tensor->data + offset, data, size))); @@ -14827,7 +14827,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( data, (const char *)tensor->data + offset, size))); @@ -14880,7 +14880,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph ggml_sycl_set_main_device(sycl_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -14888,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) continue; - assert(node->backend == GGML_BACKEND_GPU); + assert(node->backend == GGML_BACKEND_TYPE_GPU); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4e5eaff15..6caafb822 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su // compute ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); @@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_vk_sync_buffers(subctx); @@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; #endif GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t y_ne = ne10 * ne11 * ne12; @@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t d_ne = ne01 * ne11 * ne12; @@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx // TODO: support for transposed / permuted tensors GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; @@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } } - const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; @@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c vk_buffer d_D = extra->buffer_gpu.lock(); // Workaround for tiny tensor inputs on ROPE - if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { + if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } @@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { + if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) { ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); - } else if(dst->backend == GGML_BACKEND_CPU) { + } else if(dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); @@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); } @@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { // If backend is CPU, data from src0 has to be copied off the device - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; vk_buffer d_D = extra_src0->buffer_gpu.lock(); ggml_vk_sync_buffers(subctx); @@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU)); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU)); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { return; @@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { return; @@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (node->backend == GGML_BACKEND_CPU || last_node) { + if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) { ggml_vk_ctx_end(ctx->compute_ctx); ctx->compute_ctx->exit_tensor = node; ctx->compute_ctx = nullptr; @@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { return false; @@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } @@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; } @@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml int last_node = cgraph->n_nodes - 1; // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) { + while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) { last_node -= 1; } @@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml } ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::vector done; ggml_vk_print_graph_origin(tensor, done); - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { return; - GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU); if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } @@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src0_buffer = malloc(src0_size); src0_clone->data = src0_buffer; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { @@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src1_buffer = malloc(src1_size); src1_clone->data = src1_buffer; - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src1->backend == GGML_BACKEND_GPU) { + } else if (src1->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { @@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { @@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ comp_result = nullptr; comp_size = 0; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } diff --git a/ggml.c b/ggml.c index c09a3cad6..1d81553f4 100644 --- a/ggml.c +++ b/ggml.c @@ -2721,7 +2721,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } } - struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here @@ -2729,7 +2729,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, + /*.backend =*/ GGML_BACKEND_TYPE_CPU, /*.buffer =*/ NULL, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -3302,7 +3302,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3319,7 +3319,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3335,7 +3335,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); if (strcmp(cur->name, name) == 0) { return cur; @@ -5879,7 +5879,7 @@ struct ggml_tensor * ggml_top_k( int k) { GGML_ASSERT(a->ne[0] >= k); - struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC); + struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC); result = ggml_view_4d(ctx, result, k, result->ne[1], result->ne[2], result->ne[3], @@ -6673,7 +6673,7 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6705,7 +6705,7 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6978,7 +6978,7 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7231,7 +7231,7 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7411,7 +7411,7 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7419,7 +7419,7 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -7501,7 +7501,7 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7580,7 +7580,7 @@ static void ggml_compute_forward_add_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7636,7 +7636,7 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7774,7 +7774,7 @@ static void ggml_compute_forward_add1_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7828,7 +7828,7 @@ static void ggml_compute_forward_add1_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7880,7 +7880,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7932,7 +7932,7 @@ static void ggml_compute_forward_add1_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8062,7 +8062,7 @@ static void ggml_compute_forward_acc_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -8074,7 +8074,7 @@ static void ggml_compute_forward_acc_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8176,7 +8176,7 @@ static void ggml_compute_forward_sub_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8257,14 +8257,14 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; #if defined(GGML_USE_CLBLAST) - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -8365,7 +8365,7 @@ static void ggml_compute_forward_div_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8460,7 +8460,7 @@ static void ggml_compute_forward_sqr_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8506,7 +8506,7 @@ static void ggml_compute_forward_sqrt_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8552,7 +8552,7 @@ static void ggml_compute_forward_log_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8598,7 +8598,7 @@ static void ggml_compute_forward_sum_f32( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8633,7 +8633,7 @@ static void ggml_compute_forward_sum_f16( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8690,7 +8690,7 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8745,7 +8745,7 @@ static void ggml_compute_forward_mean_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8804,7 +8804,7 @@ static void ggml_compute_forward_argmax_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8855,7 +8855,7 @@ static void ggml_compute_forward_repeat_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8900,7 +8900,7 @@ static void ggml_compute_forward_repeat_f16( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8974,7 +8974,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9051,7 +9051,7 @@ static void ggml_compute_forward_concat_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9123,7 +9123,7 @@ static void ggml_compute_forward_abs_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9169,7 +9169,7 @@ static void ggml_compute_forward_sgn_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9215,7 +9215,7 @@ static void ggml_compute_forward_neg_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9261,7 +9261,7 @@ static void ggml_compute_forward_step_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9307,7 +9307,7 @@ static void ggml_compute_forward_tanh_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9353,7 +9353,7 @@ static void ggml_compute_forward_elu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9399,7 +9399,7 @@ static void ggml_compute_forward_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9446,7 +9446,7 @@ static void ggml_compute_forward_gelu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9509,7 +9509,7 @@ static void ggml_compute_forward_gelu_quick_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9572,7 +9572,7 @@ static void ggml_compute_forward_silu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9633,7 +9633,7 @@ static void ggml_compute_forward_leaky_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9686,7 +9686,7 @@ static void ggml_compute_forward_silu_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, grad)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9748,7 +9748,7 @@ static void ggml_compute_forward_hardswish_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9791,7 +9791,7 @@ static void ggml_compute_forward_hardsigmoid_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9837,7 +9837,7 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9912,7 +9912,7 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9983,7 +9983,7 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10161,7 +10161,7 @@ static void ggml_compute_forward_group_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10328,7 +10328,7 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; @@ -10341,7 +10341,7 @@ static void ggml_compute_forward_mul_mat( const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); UNUSED(desired_wsize); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (type != GGML_TYPE_F32) { assert(params->wsize >= desired_wsize); // parallelize by src0 rows @@ -10364,7 +10364,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10402,7 +10402,7 @@ static void ggml_compute_forward_mul_mat( } #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10426,7 +10426,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10583,7 +10583,7 @@ static void ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10620,7 +10620,7 @@ static void ggml_compute_forward_mul_mat_id( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10768,7 +10768,7 @@ static void ggml_compute_forward_out_prod_f32( (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst if (use_blas) { return; @@ -10781,7 +10781,7 @@ static void ggml_compute_forward_out_prod_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10961,7 +10961,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10969,7 +10969,7 @@ static void ggml_compute_forward_out_prod_q_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11087,7 +11087,7 @@ static void ggml_compute_forward_scale_f32( GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11159,7 +11159,7 @@ static void ggml_compute_forward_set_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -11171,7 +11171,7 @@ static void ggml_compute_forward_set_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11319,7 +11319,7 @@ static void ggml_compute_forward_get_rows_q( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11359,7 +11359,7 @@ static void ggml_compute_forward_get_rows_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11396,7 +11396,7 @@ static void ggml_compute_forward_get_rows_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11499,14 +11499,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11538,14 +11538,14 @@ static void ggml_compute_forward_get_rows_back_f32( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11615,7 +11615,7 @@ static void ggml_compute_forward_diag_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11684,7 +11684,7 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(n_past >= 0); - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (ith != 0) { return; } @@ -11698,7 +11698,7 @@ static void ggml_compute_forward_diag_mask_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11772,7 +11772,7 @@ static void ggml_compute_forward_soft_max_f32( assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11910,7 +11910,7 @@ static void ggml_compute_forward_soft_max_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12004,7 +12004,7 @@ static void ggml_compute_forward_alibi_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12063,7 +12063,7 @@ static void ggml_compute_forward_alibi_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12170,7 +12170,7 @@ static void ggml_compute_forward_clamp_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12310,7 +12310,7 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12488,7 +12488,7 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12719,7 +12719,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12759,7 +12759,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12818,7 +12818,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12858,7 +12858,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12962,11 +12962,11 @@ static void ggml_compute_forward_im2col_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13050,11 +13050,11 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13136,7 +13136,7 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -13178,7 +13178,7 @@ static void ggml_compute_forward_conv_transpose_2d( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13230,7 +13230,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13299,7 +13299,7 @@ static void ggml_compute_forward_pool_2d( GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13372,7 +13372,7 @@ static void ggml_compute_forward_upscale_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13432,7 +13432,7 @@ static void ggml_compute_forward_pad_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13493,7 +13493,7 @@ static void ggml_compute_forward_argsort_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13519,8 +13519,8 @@ static void ggml_compute_forward_argsort_f32( // C doesn't have a functional sort, so we do a bubble sort instead for (int64_t j = 0; j < ne0; j++) { for (int64_t k = j + 1; k < ne0; k++) { - if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || - (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { int32_t tmp = dst_data[j]; dst_data[j] = dst_data[k]; dst_data[k] = tmp; @@ -13603,11 +13603,11 @@ static void ggml_compute_forward_flash_attn_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13795,11 +13795,11 @@ static void ggml_compute_forward_flash_attn_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14054,11 +14054,11 @@ static void ggml_compute_forward_flash_ff_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14213,14 +14213,14 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14536,7 +14536,7 @@ static void ggml_compute_forward_win_part_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14602,7 +14602,7 @@ static void ggml_compute_forward_win_unpart_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14730,7 +14730,7 @@ static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14782,14 +14782,14 @@ static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_tensor * src2 = dst->src[2]; const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; - if (!inplace && params->type == GGML_TASK_INIT) { + if (!inplace && params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14871,7 +14871,7 @@ static void ggml_compute_forward_map_unary_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14920,7 +14920,7 @@ static void ggml_compute_forward_map_binary_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14969,7 +14969,7 @@ static void ggml_compute_forward_map_custom1_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14988,7 +14988,7 @@ static void ggml_compute_forward_map_custom2_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15008,7 +15008,7 @@ static void ggml_compute_forward_map_custom3_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15023,7 +15023,7 @@ static void ggml_compute_forward_map_custom1( const struct ggml_tensor * a = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15041,7 +15041,7 @@ static void ggml_compute_forward_map_custom2( const struct ggml_tensor * a = dst->src[0]; const struct ggml_tensor * b = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15060,7 +15060,7 @@ static void ggml_compute_forward_map_custom3( const struct ggml_tensor * b = dst->src[1]; const struct ggml_tensor * c = dst->src[2]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15094,14 +15094,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); @@ -15216,7 +15216,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15323,8 +15323,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #elif defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS @@ -15335,8 +15335,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #endif // GGML_USE_CUBLAS #ifdef GGML_USE_SYCL @@ -16882,7 +16882,7 @@ size_t ggml_graph_overhead(void) { struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) { const size_t obj_size = ggml_graph_nbytes(size, grads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); @@ -17429,7 +17429,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith); int node_n = -1; - int task_phase = GGML_TASK_FINALIZE; + int task_phase = GGML_TASK_TYPE_FINALIZE; while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { @@ -17441,7 +17441,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, + /*.type =*/ GGML_TASK_TYPE_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, /*.wsize =*/ cplan->work_size, @@ -17472,17 +17472,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (n_tasks == 1) { /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; + params.type = GGML_TASK_TYPE_INIT; ggml_compute_forward(¶ms, node); } // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; + params.type = GGML_TASK_TYPE_FINALIZE; ggml_compute_forward(¶ms, node); } @@ -17496,7 +17496,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - task_phase = GGML_TASK_INIT; + task_phase = GGML_TASK_TYPE_INIT; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_n, node_n); atomic_store(&state->shared->node_task, task_phase); @@ -17513,7 +17513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_tasks = ggml_get_n_tasks(node, n_threads); struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, + /*.type =*/ GGML_TASK_TYPE_INIT, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, /*.wsize =*/ cplan->work_size, @@ -17527,7 +17527,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_COMPUTE; + task_phase = GGML_TASK_TYPE_COMPUTE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17542,12 +17542,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (state->ith < n_tasks) { - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_FINALIZE; + task_phase = GGML_TASK_TYPE_FINALIZE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17783,7 +17783,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, - /*.node_task =*/ GGML_TASK_FINALIZE, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; @@ -17851,7 +17851,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -18659,7 +18659,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; bool cancel = false; @@ -18671,7 +18671,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18762,7 +18762,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL;; + return GGML_OPT_RESULT_CANCEL;; } } // ggml_graph_reset (gf); @@ -18779,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_adam( if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -18789,7 +18789,7 @@ static enum ggml_opt_result ggml_opt_adam( const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -18805,7 +18805,7 @@ static enum ggml_opt_result ggml_opt_adam( ++n_no_improvement[0]; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } @@ -18823,7 +18823,7 @@ static enum ggml_opt_result ggml_opt_adam( } } - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // @@ -18904,7 +18904,7 @@ static enum ggml_opt_result linesearch_backtracking( float sched = 0; callback(callback_data, accum_step, &sched, cancel); if (*cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18977,7 +18977,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_INVALID_WOLFE; + return GGML_OPT_RESULT_INVALID_WOLFE; } } @@ -19006,7 +19006,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; float * x = opt->lbfgs.x->data; // current parameters @@ -19047,7 +19047,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( float sched = 0; callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -19075,7 +19075,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // already optimized if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } if (opt->just_initialized) { @@ -19120,7 +19120,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // way to test and don't want to break something with so many changes lined up ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } if (ls < 0) { @@ -19143,7 +19143,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } if (gnorm/xnorm <= params.lbfgs.eps) { // converged - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -19153,7 +19153,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -19169,14 +19169,14 @@ static enum ggml_opt_result ggml_opt_lbfgs( n_no_improvement[0]++; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // update vectors s and y: @@ -19232,17 +19232,17 @@ static enum ggml_opt_result ggml_opt_lbfgs( GGML_ASSERT(false && "lbfgs failed"); - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { struct ggml_opt_params result; switch (type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, + .type = GGML_OPT_TYPE_ADAM, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ? .past = 0, @@ -19270,10 +19270,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { }, }; } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, + .type = GGML_OPT_TYPE_LBFGS, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, .past = 0, @@ -19318,12 +19318,12 @@ GGML_API void ggml_opt_init( opt->just_initialized = true; if (opt->ctx == NULL) { struct ggml_init_params ctx_opt_params; - if (opt->params.type == GGML_OPT_ADAM) { + if (opt->params.type == GGML_OPT_TYPE_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; } - } else if (opt->params.type == GGML_OPT_LBFGS) { + } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; @@ -19335,7 +19335,7 @@ GGML_API void ggml_opt_init( opt->ctx = ggml_init(ctx_opt_params); } switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19349,7 +19349,7 @@ GGML_API void ggml_opt_init( ggml_set_zero(opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19393,13 +19393,13 @@ enum ggml_opt_result ggml_opt( ctx = ggml_init(params_ctx); if (ctx == NULL) { - return GGML_OPT_NO_CONTEXT; + return GGML_OPT_RESULT_NO_CONTEXT; } free_ctx = true; } - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); @@ -19438,14 +19438,14 @@ enum ggml_opt_result ggml_opt_resume_g( void * callback_data) { // build forward + backward compute graphs - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; diff --git a/ggml.h b/ggml.h index a4166e1f7..75fd035a4 100644 --- a/ggml.h +++ b/ggml.h @@ -364,9 +364,9 @@ extern "C" { }; enum ggml_backend_type { - GGML_BACKEND_CPU = 0, - GGML_BACKEND_GPU = 10, - GGML_BACKEND_GPU_SPLIT = 20, + GGML_BACKEND_TYPE_CPU = 0, + GGML_BACKEND_TYPE_GPU = 10, + GGML_BACKEND_TYPE_GPU_SPLIT = 20, }; // model file types @@ -498,9 +498,9 @@ extern "C" { }; enum ggml_object_type { - GGML_OBJECT_TENSOR, - GGML_OBJECT_GRAPH, - GGML_OBJECT_WORK_BUFFER + GGML_OBJECT_TYPE_TENSOR, + GGML_OBJECT_TYPE_GRAPH, + GGML_OBJECT_TYPE_WORK_BUFFER }; enum ggml_log_level { @@ -642,9 +642,9 @@ extern "C" { // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. enum ggml_task_type { - GGML_TASK_INIT = 0, - GGML_TASK_COMPUTE, - GGML_TASK_FINALIZE, + GGML_TASK_TYPE_INIT = 0, + GGML_TASK_TYPE_COMPUTE, + GGML_TASK_TYPE_FINALIZE, }; struct ggml_compute_params { @@ -1649,8 +1649,8 @@ extern "C" { // sort rows enum ggml_sort_order { - GGML_SORT_ASC, - GGML_SORT_DESC, + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, }; GGML_API struct ggml_tensor * ggml_argsort( @@ -1943,8 +1943,8 @@ extern "C" { // optimization methods enum ggml_opt_type { - GGML_OPT_ADAM, - GGML_OPT_LBFGS, + GGML_OPT_TYPE_ADAM, + GGML_OPT_TYPE_LBFGS, }; // linesearch methods @@ -1958,12 +1958,12 @@ extern "C" { // optimization return values enum ggml_opt_result { - GGML_OPT_OK = 0, - GGML_OPT_DID_NOT_CONVERGE, - GGML_OPT_NO_CONTEXT, - GGML_OPT_INVALID_WOLFE, - GGML_OPT_FAIL, - GGML_OPT_CANCEL, + GGML_OPT_RESULT_OK = 0, + GGML_OPT_RESULT_DID_NOT_CONVERGE, + GGML_OPT_RESULT_NO_CONTEXT, + GGML_OPT_RESULT_INVALID_WOLFE, + GGML_OPT_RESULT_FAIL, + GGML_OPT_RESULT_CANCEL, GGML_LINESEARCH_FAIL = -128, GGML_LINESEARCH_MINIMUM_STEP, diff --git a/llama.cpp b/llama.cpp index 1f6b6cff4..28430254f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1550,8 +1550,9 @@ static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; struct llama_hparams { - bool vocab_only; - bool rope_finetuned; + bool vocab_only; + bool rope_finetuned; + uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -1580,7 +1581,8 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1707,11 +1709,20 @@ struct llama_kv_cell { bool has_seq_id(const llama_seq_id & id) const { return seq_id.find(id) != seq_id.end(); } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const llama_kv_cell & other) const { + return seq_id == other.seq_id; + } }; // ring-buffer of cached KV data struct llama_kv_cache { bool has_shift = false; + bool do_defrag = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -1723,6 +1734,9 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + std::vector cells; std::vector k_l; // per layer @@ -1958,8 +1972,8 @@ struct llama_context { static bool llama_kv_cache_init( struct llama_kv_cache & cache, const llama_model & model, - ggml_type ktype, - ggml_type vtype, + ggml_type type_k, + ggml_type type_v, uint32_t n_ctx, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -1974,6 +1988,9 @@ static bool llama_kv_cache_init( cache.size = n_ctx; cache.used = 0; + cache.type_k = type_k; + cache.type_v = type_v; + cache.cells.clear(); cache.cells.resize(n_ctx); @@ -2014,8 +2031,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2099,7 +2116,7 @@ static bool llama_kv_cache_find_slot( // find how many cells are currently in use static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { for (uint32_t i = cache.size - 1; i > 0; --i) { - if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) { return i + 1; } } @@ -2135,7 +2152,7 @@ static void llama_kv_cache_seq_rm( } else { continue; } - if (cache.cells[i].seq_id.empty()) { + if (cache.cells[i].is_empty()) { // keep count of the number of used cells if (cache.cells[i].pos >= 0) cache.used--; @@ -2186,7 +2203,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id if (new_head != cache.size && new_head < cache.head) cache.head = new_head; } -static void llama_kv_cache_seq_shift( +static void llama_kv_cache_seq_add( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -2204,10 +2221,14 @@ static void llama_kv_cache_seq_shift( cache.cells[i].delta += delta; if (cache.cells[i].pos < 0) { - if (!cache.cells[i].seq_id.empty()) cache.used--; + if (!cache.cells[i].is_empty()) { + cache.used--; + } cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; + if (new_head == cache.size) { + new_head = i; + } } } } @@ -2239,6 +2260,22 @@ static void llama_kv_cache_seq_div( } } +static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id)) { + result = std::max(result, cache.cells[i].pos); + } + } + + return result; +} + +static void llama_kv_cache_defrag(struct llama_kv_cache & cache) { + cache.do_defrag = true; +} + // // model loading and saving // @@ -2310,7 +2347,7 @@ namespace GGUFMeta { } }; - struct ArrayInfo{ + struct ArrayInfo { const gguf_type gt; const size_t length; const void * data; @@ -2329,7 +2366,7 @@ namespace GGUFMeta { }; template - class GKV: public GKV_Base { + class GKV : public GKV_Base { GKV() = delete; public: @@ -2345,46 +2382,46 @@ namespace GGUFMeta { static const char * override_type_to_str(const llama_model_kv_override_type ty) { switch (ty) { - case LLAMA_KV_OVERRIDE_BOOL: return "bool"; - case LLAMA_KV_OVERRIDE_INT: return "int"; - case LLAMA_KV_OVERRIDE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; + case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; } return "unknown"; } - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { - if (!override) { return false; } - if (override->tag == expected_type) { + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { + if (!ovrd) { return false; } + if (ovrd->tag == expected_type) { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(override->tag), override->key); - switch (override->tag) { - case LLAMA_KV_OVERRIDE_BOOL: { - LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); + __func__, override_type_to_str(ovrd->tag), ovrd->key); + switch (ovrd->tag) { + case LLAMA_KV_OVERRIDE_TYPE_BOOL: { + LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); } break; - case LLAMA_KV_OVERRIDE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); + case LLAMA_KV_OVERRIDE_TYPE_INT: { + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); } break; - case LLAMA_KV_OVERRIDE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", override->float_value); + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { + LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(override->tag), override->key)); + override_type_to_str(ovrd->tag), ovrd->key)); } return true; } LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); return false; } template static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { - target = override->bool_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { + target = ovrd->bool_value; return true; } return false; @@ -2392,9 +2429,9 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { - target = override->int_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { + target = ovrd->int_value; return true; } return false; @@ -2402,9 +2439,9 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { - target = override->float_value; + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { + target = ovrd->float_value; return true; } return false; @@ -2412,17 +2449,17 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { + try_override(T & target, const struct llama_model_kv_override * ovrd) { (void)target; - (void)override; - if (!override) { return false; } + (void)ovrd; + if (!ovrd) { return false; } // Currently, we should never end up here so it would be a bug if we do. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - override ? override->key : "NULL")); + ovrd ? ovrd->key : "NULL")); } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { - if (try_override(target, override)) { + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { return true; } if (k < 0) { return false; } @@ -2430,12 +2467,12 @@ namespace GGUFMeta { return true; } - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, override); + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); } - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, key.c_str(), target, override); + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); } }; } @@ -2846,6 +2883,15 @@ struct llama_model_loader { } }; +template<> +bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + result = (enum llama_pooling_type) tmp; + return found; +} + + // // load LLaMA models // @@ -2926,16 +2972,16 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } + static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } - static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { @@ -2999,7 +3045,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3112,10 +3158,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3133,10 +3179,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_NOMIC_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -3275,6 +3321,8 @@ static void llm_load_hparams( if (hparams.f_max_alibi_bias > 0.0f) { hparams.need_kq_pos = true; } + + hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h @@ -3577,6 +3625,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); + LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -3643,7 +3693,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3682,10 +3732,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -4598,12 +4648,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function; -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, -}; - enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU, @@ -4649,55 +4693,6 @@ static struct ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head_k/2 -// Other: n_rot = n_embd_head_k -static void llm_build_k_shift( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * K_shift, - llm_rope_type type, - int64_t n_ctx, - float freq_base, - float freq_scale, - const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int32_t n_rot = hparams.n_rot; - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - int rope_type = 0; - - switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 0), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} - static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, @@ -5001,6 +4996,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; @@ -5025,8 +5021,8 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const bool do_rope_shift; - const uint32_t pooling_type; + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; const llm_build_cb & cb; @@ -5048,6 +5044,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), + n_rot (hparams.n_rot), n_ctx (cparams.n_ctx), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), @@ -5069,8 +5066,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), + pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5093,6 +5090,74 @@ struct llm_build_context { } } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0), + lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); + } + + return gf; + } + + struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int i = 0; i < n_kv; ++i) { + const int id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + int nm = 1; + + while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { + nm++; + } + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + return gf; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5114,11 +5179,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5154,14 +5214,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5302,11 +5362,6 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); cb(KQ_pos, "KQ_pos", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5330,12 +5385,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -5420,11 +5475,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5463,13 +5513,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5639,10 +5689,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; @@ -5700,7 +5746,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5708,7 +5754,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5717,29 +5763,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, hparams.n_rot, n_head, n_tokens, + ctx0, tmpq, n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * hparams.n_rot + ggml_element_size(tmpq) * n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, hparams.n_rot, n_head, n_tokens, + ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * hparams.n_rot + ggml_element_size(tmpk) * n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5991,14 +6037,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6050,12 +6096,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -6287,11 +6333,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6328,14 +6369,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6410,11 +6451,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6444,13 +6480,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6524,11 +6560,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6564,14 +6595,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6645,11 +6676,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -6687,7 +6713,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -6698,7 +6724,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -6767,11 +6793,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -6795,14 +6816,14 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, - n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, + n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6972,11 +6993,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, @@ -7002,14 +7018,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7080,11 +7096,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7120,14 +7131,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7199,11 +7210,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7239,14 +7245,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7331,11 +7337,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7371,14 +7372,14 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7467,11 +7468,6 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - for (int il = 0; il < n_layer; ++il) { // norm @@ -7494,7 +7490,7 @@ struct llm_build_context { Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -7503,7 +7499,7 @@ struct llm_build_context { Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, - n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -7556,6 +7552,40 @@ struct llm_build_context { } }; +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_defrag(ids); + + llm.free(); + + return result; +} + +static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_k_shift(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -7675,6 +7705,20 @@ static struct ggml_cgraph * llama_build_graph( return result; } +static void llama_set_k_shift(llama_context & lctx) { + const auto & cparams = lctx.cparams; + + const int64_t n_ctx = cparams.n_ctx; + + assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); + + int32_t * data = (int32_t *) lctx.inp_K_shift->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -7742,19 +7786,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (kv_self.has_shift) { - const int64_t n_ctx = cparams.n_ctx; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7782,7 +7814,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -7798,6 +7830,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +static void llama_graph_compute( + llama_context & lctx, + ggml_cgraph * gf, + int n_threads) { +#ifdef GGML_USE_MPI + const int64_t n_layer = lctx.model.hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); + } +#endif + + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + } + + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); + +#ifdef GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif +} + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -7893,14 +7953,17 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + llama_kv_cache_update(&lctx); + ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + if (strcmp(res->name, "result_output") == 0) { // the embeddings could be the second to last tensor, or the third to last tensor if (strcmp(embeddings->name, "result_norm") != 0) { @@ -7927,40 +7990,12 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } -#ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend_metal)) { - ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); - } -#endif - - if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); - } - llama_set_inputs(lctx, batch); - ggml_backend_sched_graph_compute(lctx.sched, gf); - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + llama_graph_compute(lctx, gf, n_threads); // update the kv ring buffer { - if (kv_self.has_shift) { - kv_self.has_shift = false; - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; - } - } - kv_self.head += n_tokens; // Ensure kv cache head points to a valid index. @@ -8056,6 +8091,221 @@ static int llama_decode_internal( return 0; } +// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache +static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { + auto & kv_self = lctx.kv_self; + + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + // determine the size of the hole + uint32_t nh = 1; + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + // starting from the end, find nh non-empty cells + uint32_t nf = 0; + uint32_t is = n_kv - 1; + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + // go back and move the nf cells to the hole + for (uint32_t i1 = is; i1 < n_kv; ++i1) { + const auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + n_moves++; + nf++; + } + + LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + kv_self.head = n_used; + kv_self.used = n_used; + + // zero the rest of the cells + for (uint32_t i = n_used; i < n_kv; ++i) { + kv_self.cells[i] = llama_kv_cell(); + } + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + // ggml_graph defrag + + ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); +#endif + + const int64_t t_end = ggml_time_us(); + + LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); +} + +static void llama_kv_cache_update_internal(struct llama_context & lctx) { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); + + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; + + kv_self.has_shift = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (lctx.kv_self.do_defrag) { + llama_kv_cache_defrag_internal(lctx); + + lctx.kv_self.do_defrag = false; + } +} + // // tokenizer // @@ -11351,7 +11601,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11377,7 +11627,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11565,16 +11815,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11608,8 +11858,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11618,7 +11868,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { @@ -11671,8 +11921,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, - cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -11820,6 +12069,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +enum llama_rope_type llama_rope_type(const struct llama_model * model) { + switch (model->arch) { + // these models do not use RoPE + case LLM_ARCH_GPT2: + case LLM_ARCH_GPTJ: + case LLM_ARCH_GPTNEOX: + case LLM_ARCH_MPT: + case LLM_ARCH_REFACT: + case LLM_ARCH_BLOOM: + return LLAMA_ROPE_TYPE_NONE; + + // use what we call a normal RoPE, operating on pairs of consecutive head values + case LLM_ARCH_LLAMA: + case LLM_ARCH_BAICHUAN: + case LLM_ARCH_STARCODER: + case LLM_ARCH_PLAMO: + case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: + case LLM_ARCH_INTERNLM2: + case LLM_ARCH_MINICPM: + return LLAMA_ROPE_TYPE_NORM; + + // the pairs of head values are offset by n_rot/2 + case LLM_ARCH_FALCON: + case LLM_ARCH_PERSIMMON: + case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_STABLELM: + case LLM_ARCH_QWEN: + case LLM_ARCH_QWEN2: + case LLM_ARCH_PHI2: + case LLM_ARCH_GEMMA: + return LLAMA_ROPE_TYPE_NEOX; + + // all model arches should be listed explicitly here + case LLM_ARCH_UNKNOWN: + GGML_ASSERT(false && "unknown architecture"); + break; + } + + return LLAMA_ROPE_TYPE_NONE; +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -12062,12 +12354,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } -void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } - llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); + llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); } void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { @@ -12078,6 +12370,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); +} + +void llama_kv_cache_defrag(struct llama_context * ctx) { + llama_kv_cache_defrag(ctx->kv_self); +} + +void llama_kv_cache_update(struct llama_context * ctx) { + llama_kv_cache_update_internal(*ctx); +} + + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. @@ -12204,10 +12509,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const auto n_layer = hparams.n_layer; - const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); - const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); - const auto n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; @@ -12222,14 +12527,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + tmp_buf.resize(k_size); ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); @@ -12316,10 +12623,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int n_ctx = cparams.n_ctx; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -12335,13 +12642,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(kv_self.total_size() == kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { - size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; // v is not contiguous, copy row by row - size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; diff --git a/llama.h b/llama.h index 889edf4d9..ff131996d 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,15 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + // note: these values should be synchronized with ggml_rope + // TODO: maybe move this enum to ggml.h (ggml_rope_type) + enum llama_rope_type { + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = 2, + LLAMA_ROPE_TYPE_GLM = 4, + }; + enum llama_token_type { LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, @@ -109,23 +118,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -173,9 +182,9 @@ extern "C" { } llama_batch; enum llama_model_kv_override_type { - LLAMA_KV_OVERRIDE_INT, - LLAMA_KV_OVERRIDE_FLOAT, - LLAMA_KV_OVERRIDE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_INT, + LLAMA_KV_OVERRIDE_TYPE_FLOAT, + LLAMA_KV_OVERRIDE_TYPE_BOOL, }; struct llama_model_kv_override { @@ -360,6 +369,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -514,10 +524,12 @@ extern "C" { llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_shift( + LLAMA_API void llama_kv_cache_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, @@ -525,7 +537,9 @@ extern "C" { llama_pos delta); // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -535,6 +549,20 @@ extern "C" { llama_pos p1, int d); + // Returns the largest position present in the KV cache for the specified sequence + LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id); + + // Defragment the KV cache + // This will be applied: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_update() + LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) + LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); + // // State / sessions // diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f8574588b..24d12ef14 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1264,7 +1264,7 @@ struct test_argsort : public test_case { test_argsort(ggml_type type = GGML_TYPE_F32, std::array ne = {16, 10, 10, 10}, - ggml_sort_order order = GGML_SORT_ASC) + ggml_sort_order order = GGML_SORT_ORDER_ASC) : type(type), ne(ne), order(order) {} ggml_tensor * build_graph(ggml_context * ctx) override { @@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32)); - for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) { + for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); } diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 2c9997fca..546ca230b 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -118,7 +118,7 @@ int main(void) { const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); - struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); ggml_opt(ctx, opt_params, e);