diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index f10b3a2b2..b85bf5741 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -65,12 +65,22 @@ body: If possible, please do a git bisect and identify the exact commit that introduced the bug. validations: required: false + - type: textarea + id: command + attributes: + label: Compile command + description: > + Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true - type: textarea id: logs attributes: label: Relevant log output description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. + Please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index d157ea307..1904e31fd 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -52,6 +52,16 @@ body: - Other (Please specify in the next section) validations: required: false + - type: textarea + id: command + attributes: + label: Command line + description: > + Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: false - type: textarea id: info attributes: @@ -74,7 +84,7 @@ body: attributes: label: Relevant log output description: > - If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text. + If applicable, please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. 
render: shell validations: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 602cf5220..c85999b89 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -665,7 +665,7 @@ jobs: - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' @@ -1237,7 +1237,7 @@ jobs: - name: Create release id: create_release - uses: anzz1/action-create-release@v1 + uses: ggml-org/action-create-release@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 41f1a89ee..d71f1eb38 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -97,10 +97,9 @@ jobs: GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} - uses: jlumbroso/free-disk-space@main + uses: ggml-org/free-disk-space@v1.3.1 with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index ae86e9927..f02b7c219 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -23,5 +23,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@main + - uses: editorconfig-checker/action-editorconfig-checker@v2 + with: + version: v3.0.3 - run: editorconfig-checker diff --git a/CODEOWNERS b/CODEOWNERS index adeba5395..72d594b46 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,11 @@ # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs /ci/ @ggerganov -/.devops/ @ngxson +/.devops/*.Dockerfile @ngxson /examples/server/ @ngxson +/ggml/src/ggml-cuda/fattn* @JohannesGaessler +/ggml/src/ggml-cuda/mmq.* @JohannesGaessler +/ggml/src/ggml-cuda/mmv.* @JohannesGaessler +/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler +/ggml/src/ggml-opt.cpp @JohannesGaessler +/ggml/src/gguf.cpp @JohannesGaessler diff --git a/common/arg.cpp b/common/arg.cpp index c81b15217..27886b84e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list return *this; } +common_arg & common_arg::set_excludes(std::initializer_list excludes) { + this->excludes = std::move(excludes); + return *this; +} + common_arg & common_arg::set_env(const char * env) { help = help + "\n(env: " + env + ")"; this->env = env; @@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } +bool common_arg::is_exclude(enum llama_example ex) { + return excludes.find(ex) != excludes.end(); +} + bool 
common_arg::get_value_from_env(std::string & output) { if (env == nullptr) return false; char * value = std::getenv(env); @@ -420,7 +429,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -649,7 +658,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.prompt = value; } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), @@ -673,7 +682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt.pop_back(); } } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -700,7 +709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), diff --git a/common/arg.h b/common/arg.h index a6700d323..49ab8667b 100644 --- a/common/arg.h +++ b/common/arg.h @@ -12,6 +12,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::set excludes = {}; std::vector args; const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value @@ -53,9 +54,11 @@ struct common_arg { ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list examples); + common_arg & set_excludes(std::initializer_list excludes); common_arg & set_env(const char * env); common_arg & set_sparam(); bool in_example(enum llama_example ex); + bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & output); bool has_value_from_env(); std::string to_string(); diff --git a/common/common.cpp b/common/common.cpp index 4bb140ee2..86e4e1e24 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,6 +2,9 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#include "ggml.h" +#include "gguf.h" + #include "common.h" #include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: @@ -846,7 +849,7 @@ struct common_init_result common_init_from_params(common_params & params) { } else if (!params.model_url.empty()) { model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = llama_model_load_from_file(params.model.c_str(), mparams); } if (model == NULL) { @@ -873,7 +876,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (!ok) { - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -884,7 +887,7 @@ struct common_init_result 
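As an aside (not part of the patch): the new exclusion filter above means an option can stay registered for `LLAMA_EXAMPLE_COMMON` yet be dropped for one specific tool. A minimal sketch of how `add_opt`'s predicate evaluates for the `--prompt` option now tagged with `set_excludes({LLAMA_EXAMPLE_SERVER})`; `LLAMA_EXAMPLE_MAIN` stands in for any non-server example:

```cpp
#include "arg.h"
#include "common.h"

#include <cstdio>

int main() {
    // same registration as the --prompt option in the patch above
    common_arg arg = common_arg(
        {"-p", "--prompt"}, "PROMPT",
        "prompt to start generation with",
        [](common_params & params, const std::string & value) { params.prompt = value; }
    ).set_excludes({LLAMA_EXAMPLE_SERVER});

    // the filter inside add_opt, evaluated for two different tools
    bool kept_for_cli    = (arg.in_example(LLAMA_EXAMPLE_MAIN)   || arg.in_example(LLAMA_EXAMPLE_COMMON))
                        && !arg.is_exclude(LLAMA_EXAMPLE_MAIN);    // true: COMMON matches, no exclusion
    bool kept_for_server = (arg.in_example(LLAMA_EXAMPLE_SERVER) || arg.in_example(LLAMA_EXAMPLE_COMMON))
                        && !arg.is_exclude(LLAMA_EXAMPLE_SERVER);  // false: the exclusion wins over COMMON
    printf("cli: %d, server: %d\n", kept_for_cli, kept_for_server);
    return 0;
}
```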
common_init_from_params(common_params & params) { llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -900,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) { const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -913,7 +916,7 @@ struct common_init_result common_init_from_params(common_params & params) { params.control_vector_layer_end); if (err) { llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -926,7 +929,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (lora == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -982,7 +985,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_encoder(model)) { llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = bos; } tmp.clear(); @@ -1411,7 +1414,7 @@ struct llama_model * common_load_model_from_url( } } - return llama_load_model_from_file(local_path.c_str(), params); + return llama_model_load_from_file(local_path.c_str(), params); } struct llama_model * common_load_model_from_hf( diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a9dfb6714..a057ae45f 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { - return -1; + return LLAMA_TOKEN_NULL; } const common_ngram_cache_part part_static = part_static_it->second; int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_static : part_static) { const llama_token token = token_count_static.first; @@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram } if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { - return -1; + return LLAMA_TOKEN_NULL; } if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { - return -1; + return LLAMA_TOKEN_NULL; } return max_token; } @@ -98,9 +98,9 @@ static llama_token try_draft( common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; - for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) { const common_ngram ngram_primary = ngrams_primary[i]; common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); @@ -112,7 +112,7 
@@ static llama_token try_draft( int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; @@ -154,7 +154,7 @@ void common_ngram_cache_draft( } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; common_ngram ngram_static; @@ -177,17 +177,17 @@ void common_ngram_cache_draft( } ngrams_cd.push_back(ngram_cd); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_static, ngram_static); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { break; } diff --git a/common/ngram-cache.h b/common/ngram-cache.h index 09c2b0319..dfe012abe 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -17,13 +17,13 @@ struct common_ngram { common_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = -1; + tokens[i] = LLAMA_TOKEN_NULL; } } common_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = i < ngram_size ? input[i] : -1; + tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4e6c0f60c..01b58f976 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -687,6 +687,9 @@ class Model: if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" if res is None: logger.warning("\n") @@ -3373,6 +3376,24 @@ class CommandR2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) +@Model.register("Cohere2ForCausalLM") +class Cohere2Model(Model): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") class OlmoModel(Model): @@ -3831,6 +3852,7 @@ class DeepseekModel(Model): @Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -3852,6 +3874,15 @@ class DeepseekV2Model(Model): self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) 
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: @@ -3864,6 +3895,16 @@ class DeepseekV2Model(Model): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index fea23ddb4..56edc64a7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -107,6 +107,7 @@ models = [ {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, + {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, ] diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ed1014cae..6dea14a23 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") + # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = base_name.replace(".lora_embedding_A", ".weight") + base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name @@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace: "--base", type=Path, help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", ) + parser.add_argument( + "--base-model-id", type=str, + help="the model ID of the base model, if it is not available locally or in the adapter config. 
If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -290,6 +297,7 @@ if __name__ == '__main__': dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path + base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" @@ -313,7 +321,10 @@ if __name__ == '__main__': lparams: dict[str, Any] = json.load(f) # load base model - if dir_base_model is None: + if base_model_id is not None: + logger.info(f"Loading base model from Hugging Face: {base_model_id}") + hparams = load_hparams_from_hf(base_model_id) + elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") @@ -371,11 +382,16 @@ if __name__ == '__main__': if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - is_lora_a = ".lora_A.weight" in name - is_lora_b = ".lora_B.weight" in name + # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name + is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: + yield (base_name, tensor) + continue logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning") @@ -407,9 +423,21 @@ if __name__ == '__main__': if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: + # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: + assert dest_data.dim() == 1 + yield (dest_name, dest_data) + continue + + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() + if "token_embd.weight" in dest_name: + lora_a = lora_a.T + yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a3b21ad6b..dd75ff9f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -38,7 +38,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); @@ -194,7 +194,7 @@ int main(int argc, char ** argv) { llama_batch_free(batch); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index e2e01f2d5..d34b03099 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -41,7 +41,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); @@ -120,7 +120,7 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = llama_token_bos(model); } @@ -236,7 +236,7 @@ int main(int argc, char ** argv) { llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 736035d78..1256abb17 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,4 +1,6 @@ #include "ggml.h" +#include "gguf.h" + #include "llama.h" #include "common.h" #include "log.h" @@ -689,8 +691,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); - gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL); + gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL); gguf_set_val_u32(ctx, 
KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 7c9f50228..e899c1078 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,7 +1,9 @@ +#include "ggml.h" +#include "gguf.h" + #include "arg.h" #include "common.h" #include "llama.h" -#include "ggml.h" #include "pca.hpp" #include "mean.hpp" diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 058b5cc86..d5dcd20a0 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,7 +1,9 @@ -#include "arg.h" -#include "common.h" #include "ggml.h" #include "ggml-alloc.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" #include #include diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index e96c75117..9523ec122 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "gguf.h" #include /* abort() */ #include diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 9e3d44984..ef3ceb686 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,16 +1,18 @@ +#include "ggml.h" +#include "gguf.h" #include "llama.h" #include "common.h" #include +#include +#include +#include #include +#include +#include #include #include #include -#include - -#include -#include -#include #if defined(_WIN32) #include @@ -296,7 +298,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 7498f85ef..f31989c8c 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,10 +1,9 @@ #include "ggml.h" +#include "gguf.h" #include -#include #include #include -#include #include #undef MIN @@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); + const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); } } @@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); + const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); } } @@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); + printf("%s: 
tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n", + __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; @@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { const float * data = (const float *) cur->data; for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { - fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i)); gguf_free(ctx); return false; } @@ -245,6 +247,8 @@ int main(int argc, char ** argv) { check_data = false; } + srand(123456); + const std::string fname(argv[1]); const std::string mode (argv[2]); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 18a945b33..4d2db5624 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -165,7 +165,7 @@ int main(int argc, char * argv[]) { llama_backend_init(); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); // create generation context llama_context * ctx = llama_new_context_with_model(model, cparams); @@ -219,7 +219,7 @@ int main(int argc, char * argv[]) { llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); return 0; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2338ad106..2a0916766 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1526,10 +1526,10 @@ int main(int argc, char ** argv) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { - llama_free_model(lmodel); + llama_model_free(lmodel); } - lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -1540,7 +1540,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_free_model(lmodel); + llama_model_free(lmodel); return 1; } @@ -1626,7 +1626,7 @@ int main(int argc, char ** argv) { ggml_threadpool_free_fn(threadpool); } - llama_free_model(lmodel); + llama_model_free(lmodel); if (p) { p->print_footer(); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3cd0d2fa8..7a8a3156b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -7,6 +7,7 @@ #include "ggml-cpu.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "gguf.h" //#ifdef GGML_USE_CUDA //#include "ggml-cuda.h" @@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? 
nullptr : gguf_get_arr_data(ctx_gguf, i); std::stringstream ss; ss << "["; for (int j = 0; j < arr_n; j++) { @@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i total_size_org += orig_size; total_size_new += new_size; gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data); fout.write((const char *)new_data, new_size); size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; for (size_t j = 0; j < pad; ++j) { diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 2691c6e6b..27215a42e 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -221,7 +221,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -265,7 +265,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } @@ -323,7 +323,7 @@ int main(int argc, char ** argv) { } } - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index e9cbb51ed..2342bdd09 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index e86a60280..f3e5d66e2 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -310,7 +310,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -354,7 +354,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } @@ -575,7 +575,7 @@ int main(int argc, char ** argv) { } } - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b5e477f5b..aaee47e32 100644 --- a/examples/main/main.cpp +++ 
b/examples/main/main.cpp @@ -494,7 +494,7 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = llama_token_bos(model); } @@ -831,7 +831,7 @@ int main(int argc, char ** argv) { // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { llama_token eot = llama_token_eot(model); - embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot); need_insert_eot = false; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 09bba708f..ea91f376c 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); @@ -266,7 +266,7 @@ int main(int argc, char ** argv) { llama_batch_free(batch); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index ab91d0b40..9bfbb8862 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -309,7 +309,7 @@ int main(int argc, char ** argv) { auto mparams = llama_model_default_params(); mparams.use_mlock = false; - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = llama_model_load_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); @@ -323,7 +323,7 @@ int main(int argc, char ** argv) { if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + llama_model_free(model); return 1; } } @@ -347,7 +347,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); return 1; } included_layers++; @@ -409,7 +409,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - llama_free_model(model); + llama_model_free(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 75b817272..61420e441 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -11,6 +11,8 @@ # include #endif +#include + #include #include #include @@ -25,6 +27,13 @@ #include "json.hpp" #include "llama-cpp.h" +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) +[[noreturn]] static void sigint_handler(int) { + printf("\n"); + exit(0); // not ideal, but it's the only way to guarantee exit in all cases +} +#endif + GGML_ATTRIBUTE_FORMAT(1, 2) static std::string fmt(const char * fmt, ...) { va_list ap; @@ -83,6 +92,7 @@ class Opt { } ctx_params.n_batch = context_size >= 0 ? 
context_size : context_size_default; + ctx_params.n_ctx = ctx_params.n_batch; model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; temperature = temperature >= 0 ? temperature : temperature_default; @@ -664,7 +674,7 @@ class LlamaData { "\r%*s" "\rLoading model", get_terminal_width(), " "); - llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params)); + llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params)); if (!model) { printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } @@ -800,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Should have data in happy path + if (std::cin.eof()) { + printf("\n"); + return 1; + } + + if (user == "/bye") { + return 1; + } + + if (user.empty()) { + return 2; + } + + return 0; // Should have data in happy path } // Function to generate a response based on the prompt @@ -867,7 +890,25 @@ static bool is_stdout_a_terminal() { #endif } -// Function to tokenize the prompt +// Function to handle user input +static int get_user_input(std::string & user_input, const std::string & user) { + while (true) { + const int ret = handle_user_input(user_input, user); + if (ret == 1) { + return 1; + } + + if (ret == 2) { + continue; + } + + break; + } + + return 0; +} + +// Main chat loop function static int chat_loop(LlamaData & llama_data, const std::string & user) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); @@ -875,7 +916,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) { while (true) { // Get user input std::string user_input; - while (handle_user_input(user_input, user)) { + if (get_user_input(user_input, user) == 1) { + return 0; } add_message("user", user.empty() ? user_input : user, llama_data); @@ -916,7 +958,23 @@ static std::string read_pipe_data() { return result.str(); } +static void ctrl_c_handling() { +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset(&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined(_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif +} + int main(int argc, const char ** argv) { + ctrl_c_handling(); Opt opt; const int ret = opt.init(argc, argv); if (ret == 2) { diff --git a/examples/server/README.md b/examples/server/README.md index 3ce16945a..1f0a27d96 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `-p, --prompt PROMPT` | prompt to start generation with | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-f, --file FNAME` | a file containing the prompt (default: none) | -| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c2e62ba69..127323e77 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3797,7 +3797,7 @@ int main(int argc, char ** argv) { data["input_extra"] = input_extra; // default to empty array if it's not exist std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); data["prompt"] = format_infill( ctx_server.ctx, diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index ad4b8192a..10554db0f 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -18,7 +18,7 @@ def test_infill_without_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Ann|small|shiny)+", res.body["content"]) + assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"]) def test_infill_with_input_extra(): diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index dc6e6e67e..ad130d490 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -507,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + std::string out = token == LLAMA_TOKEN_NULL ? 
"" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 7f4da666b..d72f5bcdd 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -69,7 +69,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; - llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); + llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); if (!model) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; @@ -194,7 +194,7 @@ int main(int argc, char ** argv) { } llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3288c0250..f69117890 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; - llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); + llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); @@ -199,7 +199,7 @@ int main(int argc, char ** argv) { llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c97e22724..684ca054a 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -31,6 +31,7 @@ static void print_usage_information(const char * argv0) { printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); printf(" --stdin read prompt from standard input.\n"); printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-escape do not escape input (such as \\n, \\t, etc.).\n"); printf(" --no-parse-special do not parse control tokens.\n"); printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n"); printf(" --show-count print the total number of tokens.\n"); @@ -198,6 +199,7 @@ int main(int raw_argc, char ** raw_argv) { // variables where to put any arguments we see. 
bool printing_ids = false; bool no_bos = false; + bool no_escape = false; bool no_parse_special = false; bool disable_logging = false; bool show_token_count = false; @@ -233,6 +235,9 @@ int main(int raw_argc, char ** raw_argv) { else if (arg == "--no-bos") { no_bos = true; } + else if (arg == "--no-escape") { + no_escape = true; + } else if (arg == "--no-parse-special") { no_parse_special = true; } @@ -333,7 +338,7 @@ int main(int raw_argc, char ** raw_argv) { llama_model_params model_params = llama_model_default_params(); model_params.vocab_only = true; - llama_model * model = llama_load_model_from_file(model_path, model_params); + llama_model * model = llama_model_load_from_file(model_path, model_params); if (!model) { fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path); return 1; @@ -363,6 +368,11 @@ int main(int raw_argc, char ** raw_argv) { const bool model_wants_add_bos = llama_add_bos_token(model); const bool add_bos = model_wants_add_bos && !no_bos; const bool parse_special = !no_parse_special; + const bool escape = !no_escape; + + if (escape) { + string_process_escapes(prompt); + } std::vector tokens; tokens = common_tokenize(model, prompt, add_bos, parse_special); @@ -398,7 +408,7 @@ int main(int raw_argc, char ** raw_argv) { } // silence valgrind llama_free(ctx); - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index e33d97482..fe8acc803 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -243,7 +243,8 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h - include/ggml-vulkan.h) + include/ggml-vulkan.h + include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") #if (GGML_METAL) @@ -252,26 +253,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") install(TARGETS ggml LIBRARY PUBLIC_HEADER) install(TARGETS ggml-base LIBRARY) -# FIXME: this should be done in the backend cmake files -if (GGML_METAL) - # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY? 
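Taken together, the example changes above all follow the same mechanical recipe: `llama_load_model_from_file` becomes `llama_model_load_from_file`, `llama_free_model` becomes `llama_model_free`, and bare `-1` token sentinels become `LLAMA_TOKEN_NULL`. A minimal sketch of the migrated lifecycle, using only calls that appear in this patch; `"model.gguf"` is a placeholder path:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // was: llama_load_model_from_file(...)
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // token ids that may be undefined are now compared against LLAMA_TOKEN_NULL
    // instead of a bare -1 (same pattern as main.cpp above)
    llama_token eot = llama_token_eot(model);
    if (eot == LLAMA_TOKEN_NULL) {
        eot = llama_token_eos(model);
    }

    llama_model_free(model); // was: llama_free_model(model)
    llama_backend_free();
    return 0;
}
```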
- install( - FILES src/ggml-metal/ggml-metal.metal - PERMISSIONS - OWNER_READ - OWNER_WRITE - GROUP_READ - WORLD_READ - DESTINATION ${CMAKE_INSTALL_BINDIR}) - - if (NOT GGML_METAL_EMBED_LIBRARY) - install( - FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - endif() -endif() - if (GGML_STANDALONE) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h index 219361af4..a12342c25 100644 --- a/ggml/include/ggml-cpp.h +++ b/ggml/include/ggml-cpp.h @@ -7,6 +7,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "gguf.h" #include // Smart pointers for ggml types diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c714fc8c8..8630d92c5 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -241,12 +241,6 @@ #define GGML_ROPE_TYPE_MROPE 8 #define GGML_ROPE_TYPE_VISION 24 -#define GGUF_MAGIC "GGUF" - -#define GGUF_VERSION 3 - -#define GGUF_DEFAULT_ALIGNMENT 32 - #define GGML_UNUSED(x) (void)(x) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) @@ -403,12 +397,6 @@ extern "C" { GGML_PREC_F32, }; - enum ggml_backend_type { - GGML_BACKEND_TYPE_CPU = 0, - GGML_BACKEND_TYPE_GPU = 10, - GGML_BACKEND_TYPE_GPU_SPLIT = 20, - }; - // model file types enum ggml_ftype { GGML_FTYPE_UNKNOWN = -1, @@ -587,8 +575,6 @@ extern "C" { struct ggml_tensor { enum ggml_type type; - GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); - struct ggml_backend_buffer * buffer; int64_t ne[GGML_MAX_DIMS]; // number of elements @@ -2111,132 +2097,6 @@ extern "C" { int64_t n_per_row, const float * imatrix); - // - // gguf - // - - enum gguf_type { - GGUF_TYPE_UINT8 = 0, - GGUF_TYPE_INT8 = 1, - GGUF_TYPE_UINT16 = 2, - GGUF_TYPE_INT16 = 3, - GGUF_TYPE_UINT32 = 4, - GGUF_TYPE_INT32 = 5, - GGUF_TYPE_FLOAT32 = 6, - GGUF_TYPE_BOOL = 7, - GGUF_TYPE_STRING = 8, - GGUF_TYPE_ARRAY = 9, - GGUF_TYPE_UINT64 = 10, - GGUF_TYPE_INT64 = 11, - GGUF_TYPE_FLOAT64 = 12, - GGUF_TYPE_COUNT, // marks the end of the enum - }; - - struct gguf_context; - - struct gguf_init_params { - bool no_alloc; - - // if not NULL, create a ggml_context and allocate the tensor data in it - struct ggml_context ** ctx; - }; - - GGML_API struct gguf_context * gguf_init_empty(void); - GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); - - GGML_API void gguf_free(struct gguf_context * ctx); - - GGML_API const char * gguf_type_name(enum gguf_type type); - - GGML_API int gguf_get_version (const struct gguf_context * ctx); - GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); - GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); - GGML_API void * gguf_get_data (const struct gguf_context * ctx); - - GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); - GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); - GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id); - - GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id); - GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id); - - // will abort if the wrong type is used for the key - GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id); - GGML_API int8_t gguf_get_val_i8 (const 
struct gguf_context * ctx, int key_id); - GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id); - GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id); - GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id); - GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id); - GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id); - GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id); - GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id); - GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); - GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); - GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); - GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id); - GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); - GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); - GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); - - GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); - GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); - GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); - GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); - GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); - - // removes key if it exists - GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key); - - // overrides existing values or adds a new one - GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); - GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); - GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); - GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); - GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); - GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); - GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); - GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); - GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); - GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); - GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); - GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); - GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); - GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); - - // set or add KV pairs from another context - GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); - - // manage tensor info - GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); - GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); - GGML_API void gguf_set_tensor_data(struct gguf_context * 
ctx, const char * name, const void * data, size_t size); - - // writing gguf files can be done in 2 ways: - // - // - write the entire gguf_context to a binary file in a single pass: - // - // gguf_write_to_file(ctx, fname); - // - // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: - // - // FILE * f = fopen(fname, "wb"); - // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); - // fwrite(f, ...); - // void * data = gguf_meta_get_meta_data(ctx); - // fseek(f, 0, SEEK_SET); - // fwrite(f, data, gguf_get_meta_size(ctx)); - // free(data); - // fclose(f); - // - - // write the entire context to a binary file - GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); - - // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding - GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); - GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); - #ifdef __cplusplus // restrict not standard in C++ # if defined(__GNUC__) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h new file mode 100644 index 000000000..79ee20206 --- /dev/null +++ b/ggml/include/gguf.h @@ -0,0 +1,202 @@ +// This file contains functionality related to "GGUF" files, the binary file format used by ggml. +// GGUF files have the following structure: +// +// 1. File magic "GGUF" (4 bytes). +// 2. File version (uint32_t). +// 3. Number of ggml tensors in file (int64_t). +// 4. Number of key-value-pairs in file (int64_t). +// 5. For each KV pair: +// 1. The key (string). +// 2. The value type (gguf_type). +// 3a. If the value type is GGUF_TYPE_ARRAY: +// 1. The type of the array (gguf_type). +// 2. The number of elements in the array (uint64_t). +// 3. The binary representation of each element in the array. +// 3b. Otherwise: +// 1. The binary representation of the value. +// 6. For each ggml tensor: +// 1. The tensor name (string). +// 2. The number of dimensions of the tensor (uint32_t). +// 3. For each dimension: +// 1. The size of the tensor in the dimension (int64_t). +// 4. The tensor data type (ggml_type). +// 5. The tensor data offset in the tensor data binary blob (uint64_t). +// 7. The tensor data binary blob (optional, aligned). +// +// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator. +// All enums are stored as int32_t. +// All bool values are stored as int8_t. +// If the special key "general.alignment" (uint32_t) is defined it is used for alignment, +// otherwise GGUF_DEFAULT_ALIGNMENT is used. 
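To make the file structure described above concrete, here is a small sketch of walking a file's KV pairs and tensor info with the API this header declares below. The file name is hypothetical; note that key and tensor ids are now `int64_t` rather than `int`:

```cpp
#include "gguf.h"

#include <cinttypes>
#include <cstdio>

int main() {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) {
        fprintf(stderr, "failed to read GGUF file\n");
        return 1;
    }

    printf("version: %u, alignment: %zu\n", gguf_get_version(ctx), gguf_get_alignment(ctx));

    for (int64_t i = 0; i < gguf_get_n_kv(ctx); ++i) {
        printf("kv[%" PRId64 "]: %s (%s)\n", i,
               gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }
    for (int64_t i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("tensor[%" PRId64 "]: %s, %zu bytes at offset %zu\n", i,
               gguf_get_tensor_name(ctx, i), gguf_get_tensor_size(ctx, i),
               gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    return 0;
}
```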
+// +// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) + +#pragma once + +#include "ggml.h" + +#include <stdbool.h> +#include <stdint.h> + +#define GGUF_MAGIC "GGUF" +#define GGUF_VERSION 3 + +#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment" + +#define GGUF_DEFAULT_ALIGNMENT 32 + +#ifdef __cplusplus +extern "C" { +#endif + + // types that can be stored as GGUF KV data + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + + GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id); + + // will abort if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id); + GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id); + GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id); + + // get raw pointer to the first element of the array with the given key_id + // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference) + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id); + + // get ith C 
string from array with given key_id + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); + + GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id); + GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id); + GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id); + + // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist) + GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key); + + // overrides an existing KV pair or adds a new one, the new KV pair is always at the back + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + + // creates a new array with n elements of the given type and copies the corresponding number of bytes from data + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n); + + // creates a new array with n strings and copies the corresponding strings from data + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src); + + // add tensor to GGUF context, tensor name must be unique + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + + // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated + // in such a way that the tensor data remains as one contiguous block (except for padding) + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + + // assumes that at least gguf_get_tensor_size bytes can be read from data + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data); + + // writing gguf files can be done in 3 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname, /*only_meta =*/ false); + // + 
// - write only the meta data to a file, then re-open the file and append the tensor data: + // + // gguf_write_to_file(ctx, fname, /*only_meta =*/ true); + // FILE * f = fopen(fname, "ab"); + // fwrite(..., f); // write tensor data + // fclose(f); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // const size_t size_meta = gguf_get_meta_size(ctx); + // fseek(f, size_meta, SEEK_SET); + // fwrite(..., f); // write tensor data + // void * data = malloc(size_meta); + // gguf_get_meta_data(ctx, data); + // rewind(f); + // fwrite(data, 1, size_meta, f); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + + // writes the meta data to pointer "data" + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 84101c32c..ae1cd2337 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -208,6 +208,7 @@ add_library(ggml-base ../include/ggml-backend.h ../include/ggml-cpp.h ../include/ggml-opt.h + ../include/gguf.h ggml.c ggml-alloc.c ggml-backend.cpp @@ -215,7 +216,8 @@ add_library(ggml-base ggml-threading.cpp ggml-threading.h ggml-quants.c - ggml-quants.h) + ggml-quants.h + gguf.cpp) target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 7ddd178b5..955ed505f 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -574,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); + // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend + const char * backend_path = std::getenv("GGML_BACKEND_PATH"); + if (backend_path) { + ggml_backend_load(backend_path); + } } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index fdb4b986f..dba7be33b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -764,7 +764,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (src_backend_id == sched->n_backends - 1) { + if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, 
ggml_backend_name(split_backend), + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { + if (j == 0) { + GGML_LOG_DEBUG(": "); + } GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); } diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 622c63f1f..b311a5b1c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -4169,6 +4169,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g buffer->buft = buft; buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; + buffer->iface.get_tensor = nullptr; + buffer->iface.cpy_tensor = nullptr; return buffer; } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 8fce576c3..c22a66287 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -54,6 +54,7 @@ #include "ggml-quants.h" #include <atomic> +#include <array> #ifdef _MSC_VER #define NOINLINE __declspec(noinline) @@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX { } \ } \ +template <typename TA, typename TB, typename TC> +class tinyBLAS_Q0_PPC { + public: + tinyBLAS_Q0_PPC(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n) { + mnpack(0, m, 0, n); + } + + private: + + template<int RM, int RN> + inline void save_res(int ii, int jj, int idx, vector float* fin_res) { + for (int I = 0; I < RM; I++) { + for (int J = 0; J < RN; J++) { + *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J); + } + } + } + + template<int size> + inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) { + vector signed int vec_C[4]; + vector float CA[4] = {0}; + vector float res[4] = {0}; + __builtin_mma_disassemble_acc(vec_C, ACC); + for (int i = 0; i < 4; i++) { + CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0)); + res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]); + fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]); + } + } + + template<typename VA, typename VB> + void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { + int64_t i, j; + TA *aoffset = NULL; + VA *vecOffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; + VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0}; + VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0}; + VB t1, t2, t3, t4, t5, t6, t7, t8; + vector unsigned char xor_vector; + uint8_t flip_vec = 0x80; + xor_vector = vec_splats(flip_vec); + vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; + vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; + vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; + vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + + aoffset = const_cast<TA*>(a); + vecOffset = vec; + j = (rows >> 3); + if (j > 0) { + do { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + 
aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (cols >> 3); + if (i > 0) { + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); + C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs); + C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs); + C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs); + C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs); + + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + __builtin_vsx_disassemble_pair(c5, &C5); + __builtin_vsx_disassemble_pair(c6, &C6); + __builtin_vsx_disassemble_pair(c7, &C7); + __builtin_vsx_disassemble_pair(c8, &C8); + + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + t1 = vec_perm(c5[0], c6[0], swiz1); + t2 = vec_perm(c5[0], c6[0], swiz2); + t3 = vec_perm(c7[0], c8[0], swiz1); + t4 = vec_perm(c7[0], c8[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+128); + vec_xst(t6, 0, vecOffset+144); + vec_xst(t7, 0, vecOffset+160); + vec_xst(t8, 0, vecOffset+176); + + t1 = vec_perm(c5[1], c6[1], swiz1); + t2 = vec_perm(c5[1], c6[1], swiz2); + t3 = vec_perm(c7[1], c8[1], swiz1); + t4 = vec_perm(c7[1], c8[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+192); + vec_xst(t6, 0, vecOffset+208); + vec_xst(t7, 0, vecOffset+224); + vec_xst(t8, 0, vecOffset+240); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + aoffset4 += lda; + aoffset5 += lda; + aoffset6 += lda; + aoffset7 += lda; + aoffset8 += lda; + vecOffset += 256; + i--; + } while(i > 0); + } + j--; + } while(j > 0); + 
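+ // all full 8-row blocks have been packed at this point; the 4-row and smaller remainders are handled below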
} + + if (rows & 4) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (cols >> 3); + if (i > 0) { + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); + + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + aoffset4 += lda; + vecOffset += 128; + i--; + } while(i > 0); + } + } + if (rows & 3) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + i = (cols >> 3); + if (i > 0) { + do { + switch(rows) { + case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + __builtin_vsx_disassemble_pair(c3, &C3); + case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + __builtin_vsx_disassemble_pair(c2, &C2); + case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + __builtin_vsx_disassemble_pair(c1, &C1); + break; + } + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + 
vecOffset += 128; + i--; + } while(i > 0); + } + } + } + + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + int m_rem = MIN(m - m0, 8); + int n_rem = MIN(n - n0, 8); + // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance + // issues. After resolving them, below code will be enabled. + /*if (m_rem >= 16 && n_rem >= 8) { + mc = 16; + nc = 8; + gemm<16,8>(m0, m, n0, n); + } else if(m_rem >= 8 && n_rem >= 16) { + mc = 8; + nc = 16; + gemm<8,16>(m0, m, n0, n); + }*/ + if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 8) { + mc = 4; + nc = 8; + gemm<4,8>(m0, m, n0, n); + } else if (m_rem >= 8 && n_rem >= 4) { + mc = 8; + nc = 4; + gemm<8,4>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 4) { + mc = 4; + nc = 4; + gemm_small<4, 4>(m0, m, n0, n); + } else if ((m_rem < 4) && (n_rem > 4)) { + nc = 4; + switch(m_rem) { + case 1: + mc = 1; + gemm_small<1, 4>(m0, m, n0, n); + break; + case 2: + mc = 2; + gemm_small<2, 4>(m0, m, n0, n); + break; + case 3: + mc = 3; + gemm_small<3, 4>(m0, m, n0, n); + break; + default: + return; + } + } else if ((m_rem > 4) && (n_rem < 4)) { + mc = 4; + switch(n_rem) { + case 1: + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 2: + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 3: + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + default: + return; + } + } else { + switch((m_rem << 4) | n_rem) { + case 0x43: + mc = 4; + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm_small<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm_small<3, 3>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm_small<3, 2>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm_small<3, 1>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm_small<2, 4>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm_small<2, 3>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm_small<2, 2>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm_small<2, 1>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm_small<1, 4>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm_small<1, 3>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm_small<1, 2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm_small<1, 1>(m0, m, n0, n); + break; + default: + return; + } + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + void KERNEL_4x8(int64_t ii, int64_t jj) { + vec_t vec_A[8], vec_B[16] = {0}; + acc_t acc_0, acc_1; + std::array<int, 4> comparray; + vector float fin_res[8] = {0}; + vector float vs[8] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false); + packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]); + } + for (int I = 0; I<4; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * 
unhalf((B+((jj+J+4)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 4; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<4>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<4>(&acc_1, 0, 4, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii, jj+4, 4, fin_res); + } + + void KERNEL_8x4(int64_t ii, int64_t jj) { + vec_t vec_A[16], vec_B[8] = {0}; + acc_t acc_0, acc_1; + std::array<int, 8> comparray; + vector float fin_res[8] = {0}; + vector float vs[8] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]); + } + for (int I = 0; I<8; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 8; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii+4, jj, 4, fin_res); + } + + void KERNEL_8x8(int64_t ii, int64_t jj) { + vec_t vec_A[16], vec_B[16] = {0}; + acc_t acc_0, acc_1, acc_2, acc_3; + std::array<int, 8> comparray; + vector float fin_res[16] = {0}; + vector float vs[16] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + __builtin_mma_xxsetaccz(&acc_2); + __builtin_mma_xxsetaccz(&acc_3); + packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]); + __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]); + } + for (int I = 0; I<8; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 8; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); + compute<8>(&acc_2, 0, 8, comparray, vs, fin_res); + compute<8>(&acc_3, 4, 12, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii+4, jj, 4, fin_res); + save_res<4, 4>(ii, jj+4, 8, fin_res); + save_res<4, 4>(ii+4, jj+4, 12, fin_res); + } + + template<int RM, int RN> + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + vec_t vec_A[8], vec_B[8] = {0}; + vector signed 
int vec_C[4]; + acc_t acc_0; + + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + std::array<int, 4> comparray; + vector float res[4] = {0}; + vector float fin_res[4] = {0}; + vector float vs[4] = {0}; + vector float CA[4] = {0}; + __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value + __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value + for (int l = 0; l < k; l++) { + __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead + __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead + __builtin_mma_xxsetaccz(&acc_0); + packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false); + packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x+=4) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]); + } + for (int I = 0; I<RM; I++) { + for (int J = 0; J<RN; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + } + } + __builtin_mma_disassemble_acc(vec_C, &acc_0); + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < RM; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + + for (int i = 0; i < RM; i++) { + CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0)); + res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]); + fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]); + } + } + save_res<RM, RN>(ii, jj, 0, fin_res); + } + } + + template<int RM, int RN> + inline void kernel(int64_t ii, int64_t jj) { + if constexpr(RM == 4 && RN == 8) { + KERNEL_4x8(ii,jj); + } else if constexpr(RM == 8 && RN == 4) { + KERNEL_8x4(ii,jj); + } else if constexpr(RM == 8 && RN == 8) { + KERNEL_8x8(ii,jj); + } else { + static_assert(false, "RN/RM values not supported"); + } + } + + template<int RM, int RN> + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + kernel<RM, RN>(ii, jj); + } + } + + const TA *const A; + const TB *const B; + TC *C; + TA *At; + TB *Bt; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; + template <typename TA, typename TB, typename TC> class tinyBLAS_PPC { public: @@ -1070,13 +1769,17 @@ class tinyBLAS_PPC { void (tinyBLAS_PPC::*kernel)(int64_t, int64_t); - void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) { + template<typename VA> + void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) { int64_t i, j; - float *aoffset = NULL, *boffset = NULL; - float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; - float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; - - aoffset = const_cast<float*>(a); + TA *aoffset = NULL, *boffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; + VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; + VA c5[2] = {0}, c6[2] = {0}, 
c7[2] = {0}, c8[2] = {0}; + VA t1, t2, t3, t4, t5, t6, t7, t8; + aoffset = const_cast<TA*>(a); boffset = vec; j = (rows >> 3); if (j > 0) { @@ -1092,9 +1795,6 @@ class tinyBLAS_PPC { aoffset += 8 * lda; i = (cols >> 3); if (i > 0) { - __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; - vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2]; - vector float t1, t2, t3, t4, t5, t6, t7, t8; do { C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); @@ -1174,21 +1874,19 @@ } while(i > 0); } if (cols & 4) { - vector float c1, c2, c3, c4, c5, c6, c7, c8; - vector float t1, t2, t3, t4, t5, t6, t7, t8; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); - c4 = vec_xl(0, aoffset4); - c5 = vec_xl(0, aoffset5); - c6 = vec_xl(0, aoffset6); - c7 = vec_xl(0, aoffset7); - c8 = vec_xl(0, aoffset8); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); + c4[0] = vec_xl(0, aoffset4); + c5[0] = vec_xl(0, aoffset5); + c6[0] = vec_xl(0, aoffset6); + c7[0] = vec_xl(0, aoffset7); + c8[0] = vec_xl(0, aoffset8); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); - t3 = vec_mergeh(c5, c6); - t4 = vec_mergeh(c7, c8); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); + t3 = vec_mergeh(c5[0], c6[0]); + t4 = vec_mergeh(c7[0], c8[0]); t5 = vec_xxpermdi(t1, t2, 0); t6 = vec_xxpermdi(t3, t4, 0); t7 = vec_xxpermdi(t1, t2, 3); @@ -1198,10 +1896,10 @@ class tinyBLAS_PPC { vec_xst(t7, 0, boffset+8); vec_xst(t8, 0, boffset+12); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); - t3 = vec_mergel(c5, c6); - t4 = vec_mergel(c7, c8); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); + t3 = vec_mergel(c5[0], c6[0]); + t4 = vec_mergel(c7[0], c8[0]); t5 = vec_xxpermdi(t1, t2, 0); t6 = vec_xxpermdi(t3, t4, 0); t7 = vec_xxpermdi(t1, t2, 3); @@ -1223,9 +1921,6 @@ class tinyBLAS_PPC { aoffset += 4 * lda; i = (cols >> 3); if (i > 0) { - __vector_pair C1, C2, C3, C4; - vector float c1[2], c2[2], c3[2], c4[2]; - vector float t1, t2, t3, t4, t5, t6, t7, t8; do { C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); @@ -1272,22 +1967,20 @@ } if (cols & 4) { - vector float c1, c2, c3, c4; - vector float t1, t2, t3, t4; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); - c4 = vec_xl(0, aoffset4); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); + c4[0] = vec_xl(0, aoffset4); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset); vec_xst(t4, 0, boffset+4); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset+8); @@ -1299,21 +1992,19 @@ aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; if (cols & 4) { - vector float c1, c2, c3, c4 = {0}; - vector float t1, t2, t3, t4; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = 
vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset); vec_xst(t4, 0, boffset+4); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset+8); @@ -1321,14 +2012,13 @@ } } } - void KERNEL_4x4(int64_t ii, int64_t jj) { vec_t vec_A[4], vec_B[4], vec_C[4]; acc_t acc_0; __builtin_mma_xxsetaccz(&acc_0); for (int l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); @@ -1343,8 +2033,8 @@ __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B); + packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]); @@ -1364,8 +2054,8 @@ __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]); @@ -1387,8 +2077,8 @@ __builtin_mma_xxsetaccz(&acc_2); __builtin_mma_xxsetaccz(&acc_3); for (int l = 0; l < k; l+=8) { - READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B); + packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B); for(int x = 0; x < 16; x+=2) { __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]); @@ -1571,15 +2261,15 @@ class tinyBLAS_PPC { vec_t vec_A[4], vec_B[4]; for (int l=0; l<k; l+=4) { if (RN >= 4 && RM == 1) { - float* a = const_cast<float*>(A+(ii)*lda+l); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + TA* a = const_cast<TA*>(A+(ii)*lda+l); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); vec_A[0] = (vec_t)vec_xl(0,a); - vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1)); - vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2)); - vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3)); + vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1)); + vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2)); + vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3)); } else { - READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B); + packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); + packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); } __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); @@ -1589,7 +2279,7 @@ class tinyBLAS_PPC { __builtin_mma_disassemble_acc(vec_C, &acc_0); for (int I = 0; I < RM; I++) { 
for (int J = 0; J < RN; J++) { - *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); + *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J); } } } @@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 params->ith, params->nth}; tb.matmul(m, n); return true; + +#elif defined(__MMA__) + if (n < 8 && n != 4) + return false; + if (m < 8 && m != 4) + return false; + tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{ + k, (const block_q8_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + params->ith, params->nth}; + tb.matmul(m, n); + return true; + #else return false; #endif diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index 2f42b8a95..aafbaf803 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) uint64_t nb1, uint64_t nb2, uint64_t nb3){ - static_assert(dim >= 0 && dim <= 3); + static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]"); const int64_t i3 = blockIdx.z; const int64_t i2 = blockIdx.y; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 3896f956d..5b0dfacef 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -680,6 +680,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq3_s_cuda; case GGML_TYPE_F16: return convert_unary_cuda<half>; + case GGML_TYPE_BF16: + return convert_unary_cuda<nv_bfloat16>; default: return nullptr; } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c180adc84..0b06be729 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1728,7 +1728,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - bool use_mul_mat_vec = src0->type == GGML_TYPE_F16 + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) @@ -2869,6 +2869,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: + case GGML_TYPE_BF16: #ifdef GGML_USE_MUSA if (a->type == GGML_TYPE_Q3_K) { return false; diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index a4b4f6bc1..ac45f2d17 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -1,9 +1,9 @@ #include "common.cuh" #include "mmv.cuh" -template <typename type_acc, int block_size> +template <typename T, typename type_acc, int block_size> static __global__ void mul_mat_vec( - const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, + const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) { const int64_t row = blockIdx.x; const int64_t channel = blockIdx.z; @@ -13,7 +13,6 @@ static __global__ void mul_mat_vec( y += channel *stride_channel_y; dst += channel *stride_channel_dst; - const half2 * x2 = (const half2 *) x; const float2 * y2 = (const float2 *) y; extern __shared__ char data_mmv[]; @@ -28,
+27,44 @@ static __global__ void mul_mat_vec( float sumf; - if (std::is_same<type_acc, float>::value) { + if constexpr (std::is_same<T, half>::value) { + const half2 * x2 = (const half2 *) x; + + if (std::is_same<type_acc, float>::value) { + sumf = 0.0f; + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 tmpx = __half22float2(x2[col2]); + const float2 tmpy = y2[col2]; + sumf += tmpx.x * tmpy.x; + sumf += tmpx.y * tmpy.y; + } + } else { +#ifdef FP16_AVAILABLE + half2 sumh2 = make_half2(0.0f, 0.0f); + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 tmp = y2[col2]; + sumh2 += x2[col2] * make_half2(tmp.x, tmp.y); + } + + sumf = __low2float(sumh2) + __high2float(sumh2); +#else + NO_DEVICE_CODE; +#endif // FP16_AVAILABLE + } + } else if constexpr (std::is_same<T, nv_bfloat16>::value) { + const int * x2 = (const int *) x; sumf = 0.0f; for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const float2 tmpx = __half22float2(x2[col2]); + const int tmpx = x2[col2]; const float2 tmpy = y2[col2]; - sumf += tmpx.x * tmpy.x; - sumf += tmpx.y * tmpy.y; + sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; + sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; } } else { -#ifdef FP16_AVAILABLE - half2 sumh2 = make_half2(0.0f, 0.0f); - - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const float2 tmp = y2[col2]; - sumh2 += x2[col2] * make_half2(tmp.x, tmp.y); - } - - sumf = __low2float(sumh2) + __high2float(sumh2); -#else - NO_DEVICE_CODE; -#endif // FP16_AVAILABLE + static_assert(std::is_same<T, void>::value, "unsupported type"); } sumf = warp_reduce_sum(sumf); @@ -71,9 +86,9 @@ static __global__ void mul_mat_vec( dst[row] = sumf; } -template <typename type_acc> +template <typename T, typename type_acc> static void launch_mul_mat_vec_cuda( - const half * x, const float * y, float * dst, + const T * x, const float * y, float * dst, const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, cudaStream_t stream) { @@ -97,35 +112,35 @@ static void launch_mul_mat_vec_cuda( const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 64: { - mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 96: { - mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 128: { - mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 160: { - mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 192: { - mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 224: { - mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; case 256: { - mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>> + mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>> (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst); } break; default: { @@ -134,25 +149,25 @@ static void launch_mul_mat_vec_cuda( } } 
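+// note: the wrapper below picks the accumulator type from the requested precision, half for
+// GGML_PREC_DEFAULT and float for GGML_PREC_F32 (the bf16 kernel path above always sums in float)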
+template <typename T> static void mul_mat_vec_cuda( - const half * x, const float * y, float * dst, + const T * x, const float * y, float * dst, const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, enum ggml_prec prec, cudaStream_t stream) { switch (prec) { case GGML_PREC_DEFAULT: { - launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, + launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, stream); } break; case GGML_PREC_F32: { - launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, + launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, stream); } break; } } void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -164,7 +179,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const half * src0_d = (const half *) src0->data; const float * src1_d = (const float *) src1->data; float * dst_d = (float *) dst->data; @@ -181,7 +195,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type); const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type); - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream()); + switch (src0->type) { + case GGML_TYPE_F16: { + const half * src0_d = (const half *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, + channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream()); + } break; + case GGML_TYPE_BF16: { + const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, + channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream()); + } break; + default: + GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); + } } void ggml_cuda_op_mul_mat_vec( @@ -190,7 +217,6 @@ void ggml_cuda_op_mul_mat_vec( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -211,8 +237,20 @@ void ggml_cuda_op_mul_mat_vec( const int64_t channel_stride_y = 0; const int64_t channel_stride_dst = 0; - mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream); + switch (src0->type) { + case GGML_TYPE_F16: { + const half * src0_d = (const half *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, 
nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream); + } break; + case GGML_TYPE_BF16: { + const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream); + } break; + default: + GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); + } GGML_UNUSED(ctx); GGML_UNUSED(src1); diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h index db9f6a165..1746b0732 100644 --- a/ggml/src/ggml-cuda/vendors/cuda.h +++ b/ggml/src/ggml-cuda/vendors/cuda.h @@ -3,6 +3,7 @@ #include <cuda_runtime.h> #include <cuda.h> #include <cublas_v2.h> +#include <cuda_bf16.h> #include <cuda_fp16.h> #if CUDART_VERSION < 11020 diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 3205534d6..c905b15d7 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -3,6 +3,7 @@ #include <hip/hip_runtime.h> #include <hipblas/hipblas.h> #include <hip/hip_fp16.h> +#include <hip/hip_bfloat16.h> #ifdef __HIP_PLATFORM_AMD__ // for rocblas_initialize() #include "rocblas/rocblas.h" @@ -121,6 +122,8 @@ #define __has_builtin(x) 0 #endif +typedef hip_bfloat16 nv_bfloat16; + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 1604b8229..6cc1b69ee 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -3,6 +3,7 @@ #include <musa_runtime.h> #include <musa.h> #include <mublas.h> +#include <musa_bf16.h> #include <musa_fp16.h> #define CUBLAS_COMPUTE_16F CUDA_R_16F #define CUBLAS_COMPUTE_32F CUDA_R_32F @@ -132,3 +133,5 @@ #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamEndCapture musaStreamEndCapture + +typedef mt_bfloat16 nv_bfloat16; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 549772c57..eab017889 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -3,6 +3,8 @@ // GGML internal header #include "ggml.h" +#include "gguf.h" + #include <assert.h> #include <math.h> #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ @@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) -// expose GGUF internals for test code - -GGML_API size_t gguf_type_size(enum gguf_type type); - -GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); - -struct gguf_buf { - void * data; - size_t size; - size_t offset; -}; -GGML_API struct gguf_buf gguf_buf_init(size_t size); -GGML_API void gguf_buf_free(struct gguf_buf buf); - -GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta); - #ifdef __cplusplus } #endif + +#ifdef __cplusplus +#include <vector> + +// expose GGUF internals for test code +GGML_API size_t gguf_type_size(enum gguf_type type); +GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); +GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta); +#endif // __cplusplus diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 1bad27206..89fcde2fa 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ 
b/ggml/src/ggml-metal/CMakeLists.txt @@ -103,3 +103,19 @@ else() DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib ) endif() # GGML_METAL_EMBED_LIBRARY + +if (NOT GGML_METAL_EMBED_LIBRARY) + install( + FILES src/ggml-metal/ggml-metal.metal + PERMISSIONS + OWNER_READ + OWNER_WRITE + GROUP_READ + WORLD_READ + DESTINATION ${CMAKE_INSTALL_BINDIR}) + + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) +endif() diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 431082426..63da2b86b 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -27,15 +27,6 @@ #endif #include <cstring> -#define UNUSED GGML_UNUSED - -#define GGML_DEBUG 0 -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - #ifdef _WIN32 typedef SOCKET sockfd_t; using ssize_t = __int64; @@ -93,9 +84,23 @@ enum rpc_cmd { RPC_CMD_COPY_TENSOR, RPC_CMD_GRAPH_COMPUTE, RPC_CMD_GET_DEVICE_MEMORY, + RPC_CMD_INIT_TENSOR, + RPC_CMD_GET_ALLOC_SIZE, RPC_CMD_COUNT, }; +struct rpc_msg_get_alloc_size_req { + rpc_tensor tensor; +}; + +struct rpc_msg_get_alloc_size_rsp { + uint64_t alloc_size; +}; + +struct rpc_msg_init_tensor_req { + rpc_tensor tensor; +}; + struct rpc_msg_alloc_buffer_req { uint64_t size; }; @@ -397,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) { initialized = true; } #else - UNUSED(initialized); + GGML_UNUSED(initialized); #endif auto sock = socket_connect(host.c_str(), port); if (sock == nullptr) { @@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { } static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - UNUSED(buffer); - if (ggml_is_quantized(tensor->type)) { - // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized - GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor"); + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + + // CUDA backend on the server pads everything to 512 due to CUDA limitations. + // Due to bandwidth constraints, we only call the server init tensor functions if necessary. + // In particular, only quantized tensors need padding + if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) { + rpc_msg_init_tensor_req request; + + request.tensor = serialize_tensor(tensor); + + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0); + GGML_ASSERT(status); } } @@ -577,8 +590,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { } static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - return ggml_nbytes(tensor); + // See comments in init_tensor. 
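+ // The server-side backend may pad allocations (e.g. the CUDA backend pads quantized rows
+ // to multiples of 512), so the padded size must be queried over RPC rather than derived
+ // locally from ggml_nbytes.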
+ if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) { + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + auto sock = get_socket(buft_ctx->endpoint); + + rpc_msg_get_alloc_size_req request; + + request.tensor = serialize_tensor(tensor); + + rpc_msg_get_alloc_size_rsp response; + bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response)); + GGML_ASSERT(status); + + return response.alloc_size; + } else { + return ggml_nbytes(tensor); + } } static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { @@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) { } static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { - UNUSED(backend); + GGML_UNUSED(backend); // this is no-op because we don't have any async operations } @@ -757,6 +785,8 @@ public: bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response); bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response); bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response); + bool init_tensor(const rpc_msg_init_tensor_req & request); + bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response); private: ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor); @@ -770,6 +800,36 @@ private: std::unordered_set<ggml_backend_buffer_t> buffers; }; +bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) { + ggml_backend_buffer_type_t buft; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx = ggml_init(params); + ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); + + if (tensor == nullptr) { + GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n"); + ggml_free(ctx); + return false; + } + + if (tensor->buffer == nullptr) { + // No buffer allocated. 
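+ // Fall back to the backend's default buffer type so a sensible (padded) size can still be reported.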
+ buft = ggml_backend_get_default_buffer_type(backend); + } else { + buft = tensor->buffer->buft; + } + + response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor); + + ggml_free(ctx); + return true; +} + void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) { ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size); @@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_ GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size); buffers.insert(buffer); } else { - GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size); + GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size); } } @@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr); ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr); if (buffers.find(buffer) == buffers.end()) { - GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); + GGML_LOG_ERROR("[%s] buffer not found\n", __func__); return false; } void * base = ggml_backend_buffer_get_base(buffer); @@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) { GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr); ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr); if (buffers.find(buffer) == buffers.end()) { - GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); + GGML_LOG_ERROR("[%s] buffer not found\n", __func__); return false; } ggml_backend_buffer_free(buffer); @@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) { GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value); ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr); if (buffers.find(buffer) == buffers.end()) { - GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__); + GGML_LOG_ERROR("[%s] buffer not found\n", __func__); return false; } ggml_backend_buffer_clear(buffer, request.value); @@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { struct ggml_context * ctx = ggml_init(params); ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); if (tensor == nullptr) { - GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__); + GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); ggml_free(ctx); return false; } @@ -905,6 +965,40 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { return true; } +bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) { + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); + if (tensor == nullptr) { + GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n"); + ggml_free(ctx); + return false; + } + + // Call the backend's buffer_init_tensor function + ggml_backend_buffer_t buffer = tensor->buffer; + if (buffer && buffer->iface.init_tensor) { + buffer->iface.init_tensor(buffer, tensor); + } else { + GGML_LOG_ERROR("Null buffer for tensor
passed to init_tensor function\n"); + } + + if (tensor->extra != nullptr) { + // This pointer can either be passed around client/server, or probably better stored server-side and kept track of. + // Currently unimplemented. + GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n"); + ggml_free(ctx); + return false; + } + + ggml_free(ctx); + return true; +} + bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector & response) { struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead(), @@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector< struct ggml_context * ctx = ggml_init(params); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); if (tensor == nullptr) { - GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__); + GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); ggml_free(ctx); return false; } @@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co ggml_tensor * src = deserialize_tensor(ctx, &request.src); ggml_tensor * dst = deserialize_tensor(ctx, &request.dst); if (src == nullptr || dst == nullptr) { - GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__); + GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__); ggml_free(ctx); return false; } @@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre } break; } + case RPC_CMD_GET_ALLOC_SIZE: { + rpc_msg_get_alloc_size_req request; + if (!recv_msg(sockfd, &request, sizeof(request))) { + return; + } + rpc_msg_get_alloc_size_rsp response; + server.get_alloc_size(request, response); + if (!send_msg(sockfd, &response, sizeof(response))) { + return; + } + break; + } case RPC_CMD_GET_ALIGNMENT: { if (!recv_msg(sockfd, nullptr, 0)) { return; @@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre } break; } + case RPC_CMD_INIT_TENSOR: { + rpc_msg_init_tensor_req request; + if (!recv_msg(sockfd, &request,sizeof(request))) { + return; + } + if (!server.init_tensor(request)) { + return; + } + if (!send_msg(sockfd, nullptr, 0)) { + return; + } + break; + } case RPC_CMD_GET_TENSOR: { rpc_msg_get_tensor_req request; if (!recv_msg(sockfd, &request, sizeof(request))) { @@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total); - UNUSED(dev); + GGML_UNUSED(dev); } static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) { // TODO: obtain value from the server return GGML_BACKEND_DEVICE_TYPE_GPU; - UNUSED(dev); + GGML_UNUSED(dev); } static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { @@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const return ggml_backend_rpc_init(ctx->endpoint.c_str()); - UNUSED(params); + GGML_UNUSED(params); } static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) { @@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); - UNUSED(dev); + GGML_UNUSED(dev); } static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - UNUSED(dev); - UNUSED(op); + GGML_UNUSED(dev); + 
GGML_UNUSED(op); //TODO: call the remote backend and cache the results return true; } @@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = { static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) { return "RPC"; - UNUSED(reg); + GGML_UNUSED(reg); } static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) { return 0; - UNUSED(reg); + GGML_UNUSED(reg); } static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead"); - UNUSED(reg); - UNUSED(index); + GGML_UNUSED(reg); + GGML_UNUSED(index); } static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) { @@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch } return NULL; - UNUSED(reg); + GGML_UNUSED(reg); } static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp index 75ddfb86a..105db6f03 100644 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s [=](sycl::nd_item<3> item_ct1) { rwkv_wkv_f32_kernel( B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, - item_ct1, shared_mem_acc.get_pointer() + item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() ); }); }); diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 6d46e5f24..c0ddaac82 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -8,6 +8,20 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) + # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) + + if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*") + message(STATUS "GL_KHR_cooperative_matrix not supported by glslc") + else() + message(STATUS "GL_KHR_cooperative_matrix supported by glslc") + add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + endif() + # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. # If it's not, there will be an error to stderr. 
# If it's supported, set a define to indicate that we should compile those shaders @@ -69,6 +83,10 @@ if (Vulkan_FOUND) file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp") + if (NOT CMAKE_CROSSCOMPILING) + set(_ggml_vk_genshaders_cmd "$/${_ggml_vk_genshaders_cmd}") + endif () + add_custom_command( OUTPUT ${_ggml_vk_header} ${_ggml_vk_source} diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 020e61280..077452424 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1645,6 +1645,7 @@ static void ggml_vk_load_shaders(vk_device& device) { #undef CREATE_MM2 } else #endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ @@ -1739,7 +1740,9 @@ static void ggml_vk_load_shaders(vk_device& device) { } #undef CREATE_MM2 #undef CREATE_MM - } else if (device->fp16) { + } else +#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (device->fp16) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l) \ @@ -2040,6 +2043,8 @@ static void ggml_vk_load_shaders(vk_device& device) { std::cerr << "Done!" << std::endl; } +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props); + static vk_device ggml_vk_get_device(size_t idx) { VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); @@ -2175,9 +2180,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) { - // Intel drivers don't support coopmat properly yet - // Only RADV supports coopmat properly on AMD + if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) { device->coopmat_support = false; } @@ -2242,6 +2245,7 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct = (VkBaseOutStructure *)&subgroup_size_control_features; } +#if defined(VK_KHR_cooperative_matrix) VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; coopmat_features.pNext = nullptr; coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; @@ -2251,6 +2255,7 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; last_struct = (VkBaseOutStructure *)&coopmat_features; } +#endif #if defined(VK_NV_cooperative_matrix2) VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; @@ -2283,7 +2288,9 @@ static vk_device ggml_vk_get_device(size_t idx) { device_extensions.push_back("VK_EXT_subgroup_size_control"); } +#if defined(VK_KHR_cooperative_matrix) device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; +#endif if (coopmat2_support) { #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) @@ -2376,6 +2383,7 @@ static vk_device ggml_vk_get_device(size_t idx) { 
device_extensions.push_back("VK_KHR_shader_float16_int8"); } +#if defined(VK_KHR_cooperative_matrix) if (device->coopmat_support) { // Query supported shapes std::vector cm_props; @@ -2442,7 +2450,7 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->coopmat_support) { device_extensions.push_back("VK_KHR_cooperative_matrix"); } - +#endif device->name = GGML_VK_NAME + std::to_string(idx); device_create_info = { @@ -2515,7 +2523,6 @@ static vk_device ggml_vk_get_device(size_t idx) { return vk_instance.devices[idx]; } - static void ggml_vk_print_gpu_info(size_t idx) { GGML_ASSERT(idx < vk_instance.device_indices.size()); size_t dev_num = vk_instance.device_indices[idx]; @@ -2554,9 +2561,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; - } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; +#endif #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { @@ -2565,9 +2574,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } - if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) { - // Intel drivers don't support coopmat properly yet - // Only RADV supports coopmat properly on AMD + if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) { coopmat_support = false; } @@ -2596,6 +2603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { // Pointer to the last chain element VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features; +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; coopmat_features.pNext = nullptr; coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; @@ -2611,6 +2619,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { fp16 = fp16 && vk12_features.shaderFloat16; coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix; +#endif std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? 
"KHR_coopmat" : "none"; @@ -8088,6 +8097,25 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) { + switch (props.vendorID) { + case VK_VENDOR_ID_INTEL: + // Intel drivers don't support coopmat properly yet + return false; + case VK_VENDOR_ID_AMD: + if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { + // Workaround for AMD proprietary driver reporting support on all GPUs + const std::string name = props.deviceName; + return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs + name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs + name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs + } + return true; + default: + return true; + } +} + // checks #ifdef GGML_VULKAN_CHECK_RESULTS diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp new file mode 100644 index 000000000..8c5dd1bd1 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_KHR_cooperative_matrix : require + +void main() +{ +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 8111c0638..7b5044798 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -342,9 +342,11 @@ void process_shaders() { matmul_shaders(true, matmul_id, false, false, false); matmul_shaders(true, matmul_id, false, false, true); +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) // Coopmat, fp32acc and fp16acc matmul_shaders(true, matmul_id, true, false, false); matmul_shaders(true, matmul_id, true, false, true); +#endif #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) // Coopmat2, fp32acc and fp16acc diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2bbe5f482..90abc6ad4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1588,15 +1588,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); -#ifdef __clang__ - // temporary until ggml_tensor::backend is removed - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif - *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_TYPE_CPU, /*.buffer =*/ NULL, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -1612,10 +1605,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.padding =*/ { 0 }, }; -#ifdef __clang__ - #pragma clang diagnostic pop -#endif - // TODO: this should not be needed as long as we don't rely on aligned SIMD loads //GGML_ASSERT_ALIGNED(result->data); @@ -6417,1271 +6406,6 @@ size_t ggml_quantize_chunk( //////////////////////////////////////////////////////////////////////////////// -struct gguf_str { - uint64_t n; // GGUFv2 - char * data; -}; - -static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - 
[GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined -}; -static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); - -static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", -}; -static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); - -union gguf_value { - uint8_t uint8; - int8_t int8; - uint16_t uint16; - int16_t int16; - uint32_t uint32; - int32_t int32; - float float32; - uint64_t uint64; - int64_t int64; - double float64; - bool bool_; - - struct gguf_str str; - - struct { - enum gguf_type type; - - uint64_t n; // GGUFv2 - void * data; - } arr; -}; - -struct gguf_kv { - struct gguf_str key; - - enum gguf_type type; - union gguf_value value; -}; - -struct gguf_header { - char magic[4]; - - uint32_t version; - uint64_t n_tensors; // GGUFv2 - uint64_t n_kv; // GGUFv2 -}; - -struct gguf_tensor_info { - struct gguf_str name; - - uint32_t n_dims; - uint64_t ne[GGML_MAX_DIMS]; - - enum ggml_type type; - - uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` - - // for writing API - const void * data; - size_t size; -}; - -struct gguf_context { - struct gguf_header header; - - struct gguf_kv * kv; - struct gguf_tensor_info * infos; - - size_t alignment; - size_t offset; // offset of `data` from beginning of file - size_t size; // size of `data` in bytes - - //uint8_t * padding; - void * data; -}; - -size_t gguf_type_size(enum gguf_type type) { - GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT); - return GGUF_TYPE_SIZE[type]; -} - -static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { - if (info->n_dims > GGML_MAX_DIMS) { - fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims); - return false; - } - - if (info->type < 0 || info->type >= GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type); - return false; - } - - if (strlen(info->name.data) >= GGML_MAX_NAME) { - fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data); - return false; - } - - for (uint32_t i = 0; i < info->n_dims; ++i) { - if (info->ne[i] <= 0) { - fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]); - return false; - } - } - - // prevent overflow for total number of elements - if (INT64_MAX/info->ne[1] <= info->ne[0]) { - fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]); - return false; - } - - if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) { - fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]); - return false; - } - - if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) { - fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]); - return false; - } - - return true; -} - -static bool gguf_fread_el(FILE * file, void * dst, size_t 
size, size_t * offset) { - const size_t n = fread(dst, 1, size, file); - *offset += n; - return n == size; -} - -static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { - p->n = 0; - p->data = NULL; - - bool ok = true; - - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); - - // early exit if string length is invalid, prevents from integer overflow - if (p->n == SIZE_MAX) { - fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n); - return false; - } - - p->data = calloc(p->n + 1, 1); - if (!p->data) { - fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n); - return false; - } - - ok = ok && gguf_fread_el(file, p->data, p->n, offset); - - return ok; -} - -static void gguf_free_kv(struct gguf_kv * kv) { - if (kv->key.data) { - GGML_FREE(kv->key.data); - } - - if (kv->type == GGUF_TYPE_STRING) { - if (kv->value.str.data) { - GGML_FREE(kv->value.str.data); - } - } - - if (kv->type == GGUF_TYPE_ARRAY) { - if (kv->value.arr.data) { - if (kv->value.arr.type == GGUF_TYPE_STRING) { - for (uint64_t j = 0; j < kv->value.arr.n; ++j) { - struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; - if (str->data) { - GGML_FREE(str->data); - } - } - } - GGML_FREE(kv->value.arr.data); - } - } -} - -struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context)); - if (!ctx) { - fprintf(stderr, "%s: failed to allocate memory for context\n", __func__); - return NULL; - } - - memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); - ctx->header.version = GGUF_VERSION; - ctx->header.n_tensors = 0; - ctx->header.n_kv = 0; - - ctx->kv = NULL; - ctx->infos = NULL; - - ctx->alignment = GGUF_DEFAULT_ALIGNMENT; - ctx->offset = 0; - ctx->size = 0; - - ctx->data = NULL; - - return ctx; -} - -struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { - // offset from start of file - size_t offset = 0; - - char magic[4]; - - // check the magic before making allocations - { - gguf_fread_el(file, &magic, sizeof(magic), &offset); - - for (uint32_t i = 0; i < sizeof(magic); i++) { - if (magic[i] != GGUF_MAGIC[i]) { - fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); - return NULL; - } - } - } - - bool ok = true; - - struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context)); - if (!ctx) { - fprintf(stderr, "%s: failed to allocate memory for context\n", __func__); - return NULL; - } - - // read the header - { - strncpy(ctx->header.magic, magic, 4); - - ctx->kv = NULL; - ctx->infos = NULL; - ctx->data = NULL; - - ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); - - if (ctx->header.version == 1) { - fprintf(stderr, "%s: GGUFv1 is no longer supported. 
please use a more up-to-date version\n", __func__); - gguf_free(ctx); - return NULL; - } - - // sanity-checks to prevent from integer/buffer overflows - - ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info)); - ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead()); - ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv)); - - if (!ok) { - fprintf(stderr, "%s: failed to read header\n", __func__); - gguf_free(ctx); - return NULL; - } - } - - // read the kv pairs - { - const uint64_t n_kv = ctx->header.n_kv; - - if (n_kv > 0) { - ctx->kv = calloc(n_kv, sizeof(struct gguf_kv)); - if (!ctx->kv) { - fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__); - gguf_free(ctx); - return NULL; - } - } - - for (uint64_t i = 0; i < n_kv; ++i) { - struct gguf_kv * kv = &ctx->kv[i]; - - //fprintf(stderr, "%s: reading kv %d\n", __func__, i); - - ok = ok && gguf_fread_str(file, &kv->key, &offset); - ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); - - //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); - - switch (kv->type) { - case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; - case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; - case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; - case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; - case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; - case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; - case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; - case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; - case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; - case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; - case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; - case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; - case GGUF_TYPE_ARRAY: - { - ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); - - switch (kv->value.arr.type) { - case GGUF_TYPE_UINT8: - case GGUF_TYPE_INT8: - case GGUF_TYPE_UINT16: - case GGUF_TYPE_INT16: - case GGUF_TYPE_UINT32: - case GGUF_TYPE_INT32: - case GGUF_TYPE_FLOAT32: - case GGUF_TYPE_UINT64: - case GGUF_TYPE_INT64: - case GGUF_TYPE_FLOAT64: - case GGUF_TYPE_BOOL: - { - // prevent from integer overflow in the malloc below - if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) { - fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - gguf_free(ctx); - return NULL; - } - - kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); - if (!kv->value.arr.data) { - fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); - gguf_free(ctx); - return NULL; - } - - ok = ok && 
gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); - } break; - case GGUF_TYPE_STRING: - { - // prevent from integer overflow in the malloc below - if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) { - fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - gguf_free(ctx); - return NULL; - } - - kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str)); - if (!kv->value.arr.data) { - fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); - gguf_free(ctx); - return NULL; - } - - for (uint64_t j = 0; j < kv->value.arr.n; ++j) { - ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); - } - } break; - case GGUF_TYPE_ARRAY: - default: - { - fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type); - ok = false; - } break; - } - } break; - default: - { - fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type); - ok = false; - } break; - } - - if (!ok) { - break; - } - } - - if (!ok) { - fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); - gguf_free(ctx); - return NULL; - } - } - - // read the tensor infos - if (ctx->header.n_tensors > 0) { - ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); - if (!ctx->infos) { - fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__); - gguf_free(ctx); - return NULL; - } - - for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { - struct gguf_tensor_info * info = &ctx->infos[i]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - info->ne[j] = 1; - } - - ok = ok && gguf_fread_str(file, &info->name, &offset); - ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); - - ok = ok && (info->n_dims <= GGML_MAX_DIMS); - - for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); - } - - ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); - ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); - - ok = ok && gguf_tensor_info_sanitize(info); - - // make sure there is no duplicated tensor names - for (uint64_t j = 0; j < i && ok; ++j) { - if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) { - fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data); - ok = false; - } - } - - if (!ok) { - fprintf(stderr, "%s: failed to read tensor info\n", __func__); - gguf_free(ctx); - return NULL; - } - } - } - - ctx->alignment = GGUF_DEFAULT_ALIGNMENT; - - int alignment_idx = gguf_find_key(ctx, "general.alignment"); - if (alignment_idx != -1) { - ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); - } - - // we require the data section to be aligned, so take into account any padding - { - const size_t offset_pad = offset % ctx->alignment; - - if (offset_pad != 0) { - offset += ctx->alignment - offset_pad; - fseek(file, offset, SEEK_SET); - } - } - - // store the current file offset - this is where the data section starts - ctx->offset = offset; - - // compute the total size of the data section, taking into account the alignment - { - ctx->size = 0; - for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { - struct gguf_tensor_info * info = &ctx->infos[i]; - - const int64_t ne = - (int64_t) info->ne[0] * - (int64_t) info->ne[1] * - (int64_t) info->ne[2] * - (int64_t) info->ne[3]; - - if (ggml_blck_size(info->type) == 0 ) { - // this tensor type support have been removed: - 
fprintf(stderr, "%s: tensor '%s' of type %d: %s\n", - __func__, info->name.data, (int) info->type, ggml_type_name(info->type)); - gguf_free(ctx); - return NULL; - } - - if (ne % ggml_blck_size(info->type) != 0) { - fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", - __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); - gguf_free(ctx); - return NULL; - } - - const size_t size_cur = ggml_row_size(info->type, ne); - - ctx->size += GGML_PAD(size_cur, ctx->alignment); - } - } - - // load the tensor data only if requested - if (params.ctx != NULL) { - // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob - // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of - // the ggml_tensor structs to the appropriate locations in the binary blob - - // compute the exact size needed for the new ggml_context - const size_t mem_size = - params.no_alloc ? - (ctx->header.n_tensors )*ggml_tensor_overhead() : - (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; - - struct ggml_init_params pdata = { - .mem_size = mem_size, - .mem_buffer = NULL, - .no_alloc = params.no_alloc, - }; - - *params.ctx = ggml_init(pdata); - if (*params.ctx == NULL) { - fprintf(stderr, "%s: failed to initialize context\n", __func__); - gguf_free(ctx); - return NULL; - } - - struct ggml_context * ctx_data = *params.ctx; - - struct ggml_tensor * data = NULL; - - if (!params.no_alloc) { - data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); - - ok = ok && data != NULL; - - // read the binary blob with the tensor data - ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); - - if (!ok) { - fprintf(stderr, "%s: failed to read tensor data\n", __func__); - ggml_free(ctx_data); - gguf_free(ctx); - return NULL; - } - - ctx->data = data->data; - } - - ggml_set_no_alloc(ctx_data, true); - - // create the tensors - for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { - const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], - }; - - struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); - - ok = ok && cur != NULL; - - if (!ok) { - break; - } - - ggml_set_name(cur, ctx->infos[i].name.data); - - // point the data member to the appropriate location in the binary blob using the tensor infos - if (!params.no_alloc) { - //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file - cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data - } - } - - if (!ok) { - fprintf(stderr, "%s: failed to read the tensor data\n", __func__); - ggml_free(ctx_data); - gguf_free(ctx); - return NULL; - } - - ggml_set_no_alloc(ctx_data, params.no_alloc); - } - - return ctx; -} - -struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { - FILE * file = ggml_fopen(fname, "rb"); - if (!file) { - fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno)); - return NULL; - } - - struct gguf_context * result = gguf_init_from_file_impl(file, params); - fclose(file); - return result; -} - -void gguf_free(struct gguf_context * ctx) { - if (ctx == NULL) { - return; - } - - if (ctx->kv) { - // free string memory - not great.. 
- for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { - gguf_free_kv(&ctx->kv[i]); - } - - GGML_FREE(ctx->kv); - } - - if (ctx->infos) { - for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { - struct gguf_tensor_info * info = &ctx->infos[i]; - - if (info->name.data) { - GGML_FREE(info->name.data); - } - } - - GGML_FREE(ctx->infos); - } - - GGML_FREE(ctx); -} - -const char * gguf_type_name(enum gguf_type type) { - return GGUF_TYPE_NAME[type]; -} - -int gguf_get_version(const struct gguf_context * ctx) { - return ctx->header.version; -} - -size_t gguf_get_alignment(const struct gguf_context * ctx) { - return ctx->alignment; -} - -size_t gguf_get_data_offset(const struct gguf_context * ctx) { - return ctx->offset; -} - -void * gguf_get_data(const struct gguf_context * ctx) { - return ctx->data; -} - -int gguf_get_n_kv(const struct gguf_context * ctx) { - return ctx->header.n_kv; -} - -int gguf_find_key(const struct gguf_context * ctx, const char * key) { - // return -1 if key not found - int keyfound = -1; - - const int n_kv = gguf_get_n_kv(ctx); - - for (int i = 0; i < n_kv; ++i) { - if (strcmp(key, gguf_get_key(ctx, i)) == 0) { - keyfound = i; - break; - } - } - - return keyfound; -} - -const char * gguf_get_key(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - return ctx->kv[key_id].key.data; -} - -enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - return ctx->kv[key_id].type; -} - -enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); - return ctx->kv[key_id].value.arr.type; -} - -const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); - return ctx->kv[key_id].value.arr.data; -} - -const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); - struct gguf_kv * kv = &ctx->kv[key_id]; - struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; - return str->data; -} - -int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); - return ctx->kv[key_id].value.arr.n; -} - -uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8); - return ctx->kv[key_id].value.uint8; -} - -int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8); - return ctx->kv[key_id].value.int8; -} - -uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16); - return ctx->kv[key_id].value.uint16; -} - -int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16); - return ctx->kv[key_id].value.int16; -} - -uint32_t gguf_get_val_u32(const 
struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32); - return ctx->kv[key_id].value.uint32; -} - -int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32); - return ctx->kv[key_id].value.int32; -} - -float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32); - return ctx->kv[key_id].value.float32; -} - -uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64); - return ctx->kv[key_id].value.uint64; -} - -int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64); - return ctx->kv[key_id].value.int64; -} - -double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64); - return ctx->kv[key_id].value.float64; -} - -bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL); - return ctx->kv[key_id].value.bool_; -} - -const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); - return ctx->kv[key_id].value.str.data; -} - -const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) { - GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); - GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY); - GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING); - return &ctx->kv[key_id].value; -} - -int gguf_get_n_tensors(const struct gguf_context * ctx) { - return ctx->header.n_tensors; -} - -int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { - // return -1 if tensor not found - int tensorfound = -1; - - const int n_tensors = gguf_get_n_tensors(ctx); - - for (int i = 0; i < n_tensors; ++i) { - if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { - tensorfound = i; - break; - } - } - - return tensorfound; -} - -size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) { - return ctx->infos[i].offset; -} - -char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { - return ctx->infos[i].name.data; -} - -enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) { - return ctx->infos[i].type; -} - -// returns the index -static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { - const int idx = gguf_find_key(ctx, key); - if (idx >= 0) { - return idx; - } - - const int n_kv = gguf_get_n_kv(ctx); - - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); - ctx->kv[n_kv].key.n = strlen(key); - ctx->kv[n_kv].key.data = strdup(key); - ctx->header.n_kv++; - - return n_kv; -} - -void gguf_remove_key(struct gguf_context * ctx, const char * key) { - const int idx = gguf_find_key(ctx, key); - if (idx >= 0) { - const int n_kv = gguf_get_n_kv(ctx); - gguf_free_kv(&ctx->kv[idx]); - for (int i = idx; i < n_kv-1; ++i) { - 
ctx->kv[i] = ctx->kv[i+1]; - } - ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv)); - ctx->header.n_kv--; - } -} - -void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_UINT8; - ctx->kv[idx].value.uint8 = val; -} - -void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_INT8; - ctx->kv[idx].value.int8 = val; -} - -void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_UINT16; - ctx->kv[idx].value.uint16 = val; -} - -void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_INT16; - ctx->kv[idx].value.int16 = val; -} - -void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_UINT32; - ctx->kv[idx].value.uint32 = val; -} - -void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_INT32; - ctx->kv[idx].value.int32 = val; -} - -void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_FLOAT32; - ctx->kv[idx].value.float32 = val; -} - -void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_UINT64; - ctx->kv[idx].value.uint64 = val; -} - -void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_INT64; - ctx->kv[idx].value.int64 = val; -} - -void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_FLOAT64; - ctx->kv[idx].value.float64 = val; -} - -void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_BOOL; - ctx->kv[idx].value.bool_ = val; -} - -void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_STRING; - ctx->kv[idx].value.str.n = strlen(val); - ctx->kv[idx].value.str.data = strdup(val); -} - -void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_ARRAY; - ctx->kv[idx].value.arr.type = type; - ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type)); - memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type)); -} - -void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { - const int idx = gguf_get_or_add_key(ctx, key); - - ctx->kv[idx].type = GGUF_TYPE_ARRAY; - ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; - ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str)); - for (int i = 0; i < n; i++) { - struct gguf_str * 
str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; - str->n = strlen(data[i]); - str->data = strdup(data[i]); - } -} - -// set or add KV pairs from another context -void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { - for (uint32_t i = 0; i < src->header.n_kv; i++) { - switch (src->kv[i].type) { - case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; - case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; - case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; - case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; - case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; - case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; - case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; - case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; - case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; - case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; - case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; - case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; - case GGUF_TYPE_ARRAY: - { - if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *)); - for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { - data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; - } - gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); - GGML_FREE((void *)data); - } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { - GGML_ABORT("nested arrays not supported"); - } else { - gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); - } - } break; - default: GGML_ABORT("invalid type"); - } - } -} - -void gguf_add_tensor( - struct gguf_context * ctx, - const struct ggml_tensor * tensor) { - GGML_ASSERT(tensor); - if (gguf_find_tensor(ctx, tensor->name) != -1) { - GGML_ABORT("duplicated tensor name"); - } - - const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); - - ctx->infos[idx].name.n = strlen(tensor->name); - ctx->infos[idx].name.data = strdup(tensor->name); - - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - ctx->infos[idx].ne[i] = 1; - } - - ctx->infos[idx].n_dims = ggml_n_dims(tensor); - for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) { - ctx->infos[idx].ne[i] = tensor->ne[i]; - } - - ctx->infos[idx].type = tensor->type; - ctx->infos[idx].offset = 0; - ctx->infos[idx].data = tensor->data; - ctx->infos[idx].size = ggml_nbytes(tensor); - - if (ctx->header.n_tensors > 0) { - ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); - } - - ctx->header.n_tensors++; -} - -void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { - const int idx = gguf_find_tensor(ctx, name); - if (idx < 0) { - GGML_ABORT("tensor not found"); - } - - ctx->infos[idx].type = type; -} - -void gguf_set_tensor_data(struct gguf_context * ctx, const char * 
name, const void * data, size_t size) {
-    const int idx = gguf_find_tensor(ctx, name);
-    if (idx < 0) {
-        GGML_ABORT("tensor not found");
-    }
-
-    ctx->infos[idx].data = data;
-    ctx->infos[idx].size = size;
-
-    // update offsets
-    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
-        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
-    }
-}
-
-//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-//    fwrite(&val->n,   sizeof(val->n), 1, file);
-//    fwrite(val->data, sizeof(char),   val->n, file);
-//}
-//
-//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-//    fwrite(val, sizeof(char), size, file);
-//}
-
-struct gguf_buf gguf_buf_init(size_t size) {
-    struct gguf_buf buf = {
-        /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
-        /*buf.size   =*/ size,
-        /*buf.offset =*/ 0,
-    };
-
-    return buf;
-}
-
-void gguf_buf_free(struct gguf_buf buf) {
-    if (buf.data) {
-        GGML_FREE(buf.data);
-    }
-}
-
-static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
-    if (buf->offset + size > buf->size) {
-        buf->size = 1.5*(buf->offset + size);
-        if (buf->data) {
-            buf->data = realloc(buf->data, buf->size);
-        }
-    }
-}
-
-static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
-    gguf_buf_grow(buf, sizeof(val->n) + val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
-    }
-    buf->offset += sizeof(val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val->data, val->n);
-    }
-    buf->offset += val->n;
-}
-
-static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
-    gguf_buf_grow(buf, el_size);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val, el_size);
-    }
-    buf->offset += el_size;
-}
-
-void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
-    // write header
-    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
-
-    // write key-value pairs
-    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
-        struct gguf_kv * kv = &ctx->kv[i];
-
-        gguf_bwrite_str(buf, &kv->key);
-        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
-
-        switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_bwrite_el( buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
-            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
-            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
-            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
-            case GGUF_TYPE_ARRAY:
-                {
-                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
-
-                    switch (kv->value.arr.type) {
-                        case GGUF_TYPE_UINT8:
-                        case GGUF_TYPE_INT8:
-                        case GGUF_TYPE_UINT16:
-                        case GGUF_TYPE_INT16:
-                        case GGUF_TYPE_UINT32:
-                        case GGUF_TYPE_INT32:
-                        case GGUF_TYPE_FLOAT32:
-                        case GGUF_TYPE_UINT64:
-                        case GGUF_TYPE_INT64:
-                        case GGUF_TYPE_FLOAT64:
-                        case GGUF_TYPE_BOOL:
-                            {
-                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
-                            } break;
-                        case GGUF_TYPE_STRING:
-                            {
-                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
-                                }
-                            } break;
-                        case GGUF_TYPE_ARRAY:
-                        default: GGML_ABORT("invalid type");
-                    }
-                } break;
-            default: GGML_ABORT("invalid type");
-        }
-    }
-
-    // write tensor infos
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        gguf_bwrite_str(buf, &info->name);
-        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
-        for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
-        }
-        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
-        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
-    }
-
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset     = buf->offset;
-        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
-
-        if (offset_pad != offset) {
-            uint8_t pad = 0;
-            for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-    }
-
-    if (only_meta) {
-        return;
-    }
-
-    size_t offset = 0;
-
-    // write tensor data
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        const size_t size     = info->size;
-        const size_t size_pad = GGML_PAD(size, ctx->alignment);
-
-        gguf_bwrite_el(buf, info->data, size);
-
-        if (size_pad != size) {
-            uint8_t pad = 0;
-            for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-
-        GGML_ASSERT(offset == info->offset);
-
-        offset += size_pad;
-    }
-}
-
-void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = ggml_fopen(fname, "wb");
-    if (!file) {
-        GGML_ABORT("failed to open file for writing");
-    }
-
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, only_meta);
-
-    fwrite(buf.data, 1, buf.offset, file);
-
-    gguf_buf_free(buf);
-
-    fclose(file);
-}
-
-size_t gguf_get_meta_size(const struct gguf_context * ctx) {
-    // no allocs - only compute size
-    struct gguf_buf buf = gguf_buf_init(0);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    return buf.offset;
-}
-
-void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    memcpy(data, buf.data, buf.offset);
-
-    gguf_buf_free(buf);
-}
-
 void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
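The C `gguf_buf` writer removed above is not replaced inside ggml.c; the GGUF reader and writer move into the new C++ translation unit that follows. For orientation, a minimal sketch of reading metadata through this API (illustrative only: the file name is a placeholder and error handling is reduced to early returns):

    #include "gguf.h"

    #include <cstdio>

    int main() {
        struct gguf_init_params params = {
            /*no_alloc =*/ true,    // metadata only, do not load the tensor data blob
            /*ctx      =*/ nullptr, // no ggml_context needed when only inspecting KV pairs
        };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (!ctx) {
            return 1;
        }

        // key ids and counts are int64_t in the new API (previously int)
        const int64_t n_kv = gguf_get_n_kv(ctx);
        for (int64_t i = 0; i < n_kv; ++i) {
            printf("%s: %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
        }

        gguf_free(ctx);
        return 0;
    }
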
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
new file mode 100644
index 000000000..655ed600a
--- /dev/null
+++ b/ggml/src/gguf.cpp
@@ -0,0 +1,1325 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "gguf.h"
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+template <typename T>
+struct type_to_gguf_type;
+
+template <>
+struct type_to_gguf_type<uint8_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT8;
+};
+
+template <>
+struct type_to_gguf_type<int8_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT8;
+};
+
+template <>
+struct type_to_gguf_type<uint16_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT16;
+};
+
+template <>
+struct type_to_gguf_type<int16_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT16;
+};
+
+template <>
+struct type_to_gguf_type<uint32_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT32;
+};
+
+template <>
+struct type_to_gguf_type<int32_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT32;
+};
+
+template <>
+struct type_to_gguf_type<float> {
+    static constexpr enum gguf_type value = GGUF_TYPE_FLOAT32;
+};
+
+template <>
+struct type_to_gguf_type<bool> {
+    static constexpr enum gguf_type value = GGUF_TYPE_BOOL;
+};
+
+template <>
+struct type_to_gguf_type<std::string> {
+    static constexpr enum gguf_type value = GGUF_TYPE_STRING;
+};
+
+template <>
+struct type_to_gguf_type<uint64_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT64;
+};
+
+template <>
+struct type_to_gguf_type<int64_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT64;
+};
+
+template <>
+struct type_to_gguf_type<double> {
+    static constexpr enum gguf_type value = GGUF_TYPE_FLOAT64;
+};
+
+static const std::map<gguf_type, size_t> GGUF_TYPE_SIZE = {
+    {GGUF_TYPE_UINT8,   sizeof(uint8_t)},
+    {GGUF_TYPE_INT8,    sizeof(int8_t)},
+    {GGUF_TYPE_UINT16,  sizeof(uint16_t)},
+    {GGUF_TYPE_INT16,   sizeof(int16_t)},
+    {GGUF_TYPE_UINT32,  sizeof(uint32_t)},
+    {GGUF_TYPE_INT32,   sizeof(int32_t)},
+    {GGUF_TYPE_FLOAT32, sizeof(float)},
+    {GGUF_TYPE_BOOL,    sizeof(int8_t)},
+    {GGUF_TYPE_STRING,  0}, // undefined
+    {GGUF_TYPE_ARRAY,   0}, // undefined
+    {GGUF_TYPE_UINT64,  sizeof(uint64_t)},
+    {GGUF_TYPE_INT64,   sizeof(int64_t)},
+    {GGUF_TYPE_FLOAT64, sizeof(double)},
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
+static const std::map<gguf_type, const char *> GGUF_TYPE_NAME = {
+    {GGUF_TYPE_UINT8,   "u8"},
+    {GGUF_TYPE_INT8,    "i8"},
+    {GGUF_TYPE_UINT16,  "u16"},
+    {GGUF_TYPE_INT16,   "i16"},
+    {GGUF_TYPE_UINT32,  "u32"},
+    {GGUF_TYPE_INT32,   "i32"},
+    {GGUF_TYPE_FLOAT32, "f32"},
+    {GGUF_TYPE_BOOL,    "bool"},
+    {GGUF_TYPE_STRING,  "str"},
+    {GGUF_TYPE_ARRAY,   "arr"},
+    {GGUF_TYPE_UINT64,  "u64"},
+    {GGUF_TYPE_INT64,   "i64"},
+    {GGUF_TYPE_FLOAT64, "f64"},
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
+size_t gguf_type_size(enum gguf_type type) {
+    auto it = GGUF_TYPE_SIZE.find(type);
+    return it == GGUF_TYPE_SIZE.end() ? 0 : it->second;
+}
+
+struct gguf_kv {
+    std::string key;
+
+    bool is_array;
+    enum gguf_type type;
+
+    std::vector<int8_t>      data;
+    std::vector<std::string> data_string;
+
+    template <typename T>
+    gguf_kv(const std::string & key, const T value)
+            : key(key), is_array(false), type(type_to_gguf_type<T>::value) {
+        GGML_ASSERT(!key.empty());
+        data.resize(sizeof(T));
+        memcpy(data.data(), &value, sizeof(T));
+    }
+
+    template <typename T>
+    gguf_kv(const std::string & key, const std::vector<T> & value)
+            : key(key), is_array(true), type(type_to_gguf_type<T>::value) {
+        GGML_ASSERT(!key.empty());
+        data.resize(value.size()*sizeof(T));
+        for (size_t i = 0; i < value.size(); ++i) {
+            const T tmp = value[i];
+            memcpy(data.data() + i*sizeof(T), &tmp, sizeof(T));
+        }
+    }
+
+    gguf_kv(const std::string & key, const std::string & value)
+            : key(key), is_array(false), type(GGUF_TYPE_STRING) {
+        GGML_ASSERT(!key.empty());
+        data_string.push_back(value);
+    }
+
+    gguf_kv(const std::string & key, const std::vector<std::string> & value)
+            : key(key), is_array(true), type(GGUF_TYPE_STRING) {
+        GGML_ASSERT(!key.empty());
+        data_string = value;
+    }
+
+    const std::string & get_key() const {
+        return key;
+    }
+
+    const enum gguf_type & get_type() const {
+        return type;
+    }
+
+    size_t get_ne() const {
+        if (type == GGUF_TYPE_STRING) {
+            const size_t ne = data_string.size();
+            GGML_ASSERT(is_array || ne == 1);
+            return ne;
+        }
+        const size_t type_size = gguf_type_size(type);
+        GGML_ASSERT(data.size() % type_size == 0);
+        const size_t ne = data.size() / type_size;
+        GGML_ASSERT(is_array || ne == 1);
+        return ne;
+    }
+
+    template <typename T>
+    const T & get_val(const size_t i = 0) const {
+        GGML_ASSERT(type_to_gguf_type<T>::value == type);
+        if constexpr (std::is_same<T, std::string>::value) {
+            GGML_ASSERT(data_string.size() >= i+1);
+            return data_string[i];
+        }
+        const size_t type_size = gguf_type_size(type);
+        GGML_ASSERT(data.size() % type_size == 0);
+        GGML_ASSERT(data.size() >= (i+1)*type_size);
+        return reinterpret_cast<const T *>(data.data())[i];
+    }
+
+    void cast(const enum gguf_type new_type) {
+        const size_t new_type_size = gguf_type_size(new_type);
+        GGML_ASSERT(data.size() % new_type_size == 0);
+        type = new_type;
+    }
+};
+
+struct gguf_tensor_info {
+    struct ggml_tensor t; // for holding the equivalent info
+    uint64_t offset;      // offset from start of `data`, must be a multiple of `ALIGNMENT`
+};
+
+struct gguf_context {
+    uint32_t version = GGUF_VERSION;
+
+    std::vector<struct gguf_kv> kv;
+    std::vector<struct gguf_tensor_info> info;
+
+    size_t alignment = GGUF_DEFAULT_ALIGNMENT;
+    size_t offset    = 0; // offset of `data` from beginning of file
+    size_t size      = 0; // size of `data` in bytes
+
+    void * data = nullptr;
+};
+
+struct gguf_reader {
+    FILE * file;
+
+    gguf_reader(FILE * file) : file(file) {}
+
+    template <typename T>
+    bool read(T & dst) const {
+        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+    }
+
+    template <typename T>
+    bool read(std::vector<T> & dst, const size_t n) const {
+        dst.resize(n);
+        for (size_t i = 0; i < dst.size(); ++i) {
+            if constexpr (std::is_same<T, bool>::value) {
+                bool tmp;
+                if (!read(tmp)) {
+                    return false;
+                }
+                dst[i] = tmp;
+            } else {
+                if (!read(dst[i])) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool read(bool & dst) const {
+        int8_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = tmp != 0;
+        return true;
+    }
+
+    bool read(enum ggml_type & dst) const {
+        int32_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = ggml_type(tmp);
+        return true;
+    }
+
+    bool read(enum gguf_type & dst) const {
+        int32_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = gguf_type(tmp);
+        return true;
+    }
+
+    bool read(std::string & dst) const {
+        uint64_t size = -1;
+        if (!read(size)) {
+            return false;
+        }
+        dst.resize(size);
+        return fread(dst.data(), 1, dst.length(), file) == dst.length();
+    }
+
+    bool read(void * dst, const size_t size) const {
+        return fread(dst, 1, size, file) == size;
+    }
+};
+
+struct gguf_context * gguf_init_empty(void) {
+    return new gguf_context;
+}
+
+template <typename T>
+bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct gguf_kv> & kv, const std::string & key, const bool is_array, const size_t n) {
+    if (is_array) {
+        std::vector<T> value;
+        try {
+            if (!gr.read(value, n)) {
+                return false;
+            }
+        } catch (std::length_error &) {
+            fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
+            return false;
+        } catch (std::bad_alloc &) {
+            fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
+            return false;
+        }
+        kv.emplace_back(key, value);
+    } else {
+        T value;
+        if (!gr.read(value)) {
+            return false;
+        }
+        kv.emplace_back(key, value);
+    }
+    return true;
+}
+
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
+    const struct gguf_reader gr(file);
+    struct gguf_context * ctx = new gguf_context;
+
+    bool ok = true;
+
+    // file magic
+    {
+        std::vector<char> magic;
+        ok = ok && gr.read(magic, 4);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read magic\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        for (uint32_t i = 0; i < magic.size(); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+                gguf_free(ctx);
+                return nullptr;
+            }
+        }
+    }
+
+    // header
+    int64_t n_kv      = 0;
+    int64_t n_tensors = 0;
+
+    if (ok && gr.read(ctx->version)) {
+        if (ctx->version == 1) {
+            fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
+            ok = false;
+        }
+        if (ctx->version > GGUF_VERSION) {
+            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
+                __func__, ctx->version, GGUF_VERSION);
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (ok && gr.read(n_tensors)) {
+        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
+        if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
+            fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
+                __func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (ok && gr.read(n_kv)) {
+        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
+        if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
+            fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
+                __func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (!ok) {
+        fprintf(stderr, "%s: failed to read header\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    // KV pairs
+    {
+        for (int64_t i = 0; ok && i < n_kv; ++i) {
+            std::string key;
+            gguf_type   type     = gguf_type(-1);
+            bool        is_array = false;
+            uint64_t    n        = 1;
+
+            try {
+                ok = ok && gr.read(key);
+            } catch (std::length_error &) {
+                fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
+                ok = false;
+            } catch (std::bad_alloc &) {
+                fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
+                ok = false;
+            }
+            for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
+                if (key == ctx->kv[j].key) {
+                    fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
+                    ok = false;
+                }
+            }
+            if (!ok) {
+                break;
+            }
+
+            ok = ok && gr.read(type);
+            if (type == GGUF_TYPE_ARRAY) {
+                is_array = true;
+                ok = ok && gr.read(type);
+                ok = ok && gr.read(n);
+            }
+            if (!ok) {
+                break;
+            }
+
+            switch (type) {
+                case GGUF_TYPE_UINT8:   ok = ok && gguf_read_emplace_helper<uint8_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT8:    ok = ok && gguf_read_emplace_helper<int8_t>     (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT16:  ok = ok && gguf_read_emplace_helper<uint16_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT16:   ok = ok && gguf_read_emplace_helper<int16_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT32:  ok = ok && gguf_read_emplace_helper<uint32_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT32:   ok = ok && gguf_read_emplace_helper<int32_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_read_emplace_helper<float>      (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_read_emplace_helper<bool>       (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_STRING:  ok = ok && gguf_read_emplace_helper<std::string>(gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT64:  ok = ok && gguf_read_emplace_helper<uint64_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT64:   ok = ok && gguf_read_emplace_helper<int64_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_FLOAT64: ok = ok && gguf_read_emplace_helper<double>     (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_ARRAY:
+                default:
+                    {
+                        fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
+                        ok = false;
+                    } break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+        GGML_ASSERT(int64_t(ctx->kv.size()) == n_kv);
+
+        const int alignment_idx = gguf_find_key(ctx, GGUF_KEY_GENERAL_ALIGNMENT);
+        ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);
+
+        if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
+            fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
+            gguf_free(ctx);
+            return nullptr;
+        }
+    }
+
+    // read the tensor info
+    for (int64_t i = 0; ok && i < n_tensors; ++i) {
+        struct gguf_tensor_info info;
+
+        // tensor name
+        {
+            std::string name;
+            try {
+                ok = ok && gr.read(name);
+            } catch (std::length_error &) {
+                fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
+                ok = false;
+            } catch (std::bad_alloc &) {
+                fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
+                ok = false;
+            }
+            if (name.length() >= GGML_MAX_NAME) {
+                fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
+                ok = false;
+                break;
+            }
+            ggml_set_name(&info.t, name.c_str());
+
+            // make sure there are no duplicate tensor names
+            for (int64_t j = 0; ok && j < i; ++j) {
+                if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
+                    fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
+                    ok = false;
+                    break;
+                }
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor shape
+        {
+            uint32_t n_dims = -1;
+            ok = ok && gr.read(n_dims);
+            if (n_dims > GGML_MAX_DIMS) {
+                fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
+                    __func__, info.t.name, n_dims, GGML_MAX_DIMS);
+                ok = false;
+                break;
+            }
+            for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) {
+                info.t.ne[j] = 1;
+                if (j < n_dims) {
+                    ok = ok && gr.read(info.t.ne[j]);
+                }
+
+                // check that all ne are non-negative
+                if (info.t.ne[j] < 0) {
+                    fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
+                        __func__, info.t.name, j, info.t.ne[j]);
+                    ok = false;
+                    break;
+                }
+            }
+
+            // check that the total number of elements is representable
+            if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) ||
+                       (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
+                       (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {
+
+                fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
+                    "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
+                    __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
+                ok = false;
+                break;
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor type
+        {
+            ok = ok && gr.read(info.t.type);
+
+            // check that tensor type is within defined range
+            if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
+                    __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
+                ok = false;
+                break;
+            }
+            const size_t  type_size = ggml_type_size(info.t.type);
+            const int64_t blck_size = ggml_blck_size(info.t.type);
+
+            // check that row size is divisible by block size
+            if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
+                fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
+                    "not a multiple of block size (%" PRId64 ")\n",
+                    __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
+                ok = false;
+                break;
+            }
+
+            // calculate byte offsets given the tensor shape and type
+            info.t.nb[0] = type_size;
+            info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
+            for (int j = 2; j < GGML_MAX_DIMS; ++j) {
+                info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor data offset within buffer
+        ok = ok && gr.read(info.offset);
+
+        ctx->info.push_back(info);
+    }
+
+    if (!ok) {
+        fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+    GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
+
+    // we require the data section to be aligned, so take into account any padding
+    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
+        fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = ftell(file);
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (size_t i = 0; i < ctx->info.size(); ++i) {
+            const gguf_tensor_info & ti = ctx->info[i];
+            if (ti.offset != ctx->size) {
+                fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
+                    __func__, ti.t.name, ti.offset, ctx->size);
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                gguf_free(ctx);
+                return nullptr;
+            }
+            ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != nullptr) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (n_tensors    )*ggml_tensor_overhead() :
+            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            /*mem_size   =*/ mem_size,
+            /*mem_buffer =*/ nullptr,
+            /*no_alloc   =*/ params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+        if (*params.ctx == nullptr) {
+            fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        struct ggml_tensor * data = nullptr;
+
+        if (!params.no_alloc) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != nullptr;
+
+            // read the binary blob with the tensor data
+            ok = ok && gr.read(data->data, ctx->size);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
+                ggml_free(ctx_data);
+                *params.ctx = nullptr;
+                gguf_free(ctx);
+                return nullptr;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (size_t i = 0; i < ctx->info.size(); ++i) {
+            const struct gguf_tensor_info & info = ctx->info[i];
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, info.t.type, GGML_MAX_DIMS, info.t.ne);
+
+            ok = ok && cur != nullptr;
+
+            if (!ok) {
+                break;
+            }
+
+            ggml_set_name(cur, info.t.name);
+
+            // point the data member to the appropriate location in the binary blob using the tensor info
+            if (!params.no_alloc) {
+                cur->data = (char *) data->data + info.offset;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to create tensors\n", __func__);
+            ggml_free(ctx_data);
+            *params.ctx = nullptr;
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    return ctx;
+}
+
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = ggml_fopen(fname, "rb");
+
+    if (!file) {
+        fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
+        return nullptr;
+    }
+
+    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    fclose(file);
+    return result;
+}
+
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
+    delete ctx;
+}
+
+const char * gguf_type_name(enum gguf_type type) {
+    auto it = GGUF_TYPE_NAME.find(type);
+    return it == GGUF_TYPE_NAME.end() ? nullptr : it->second;
+}
+
+uint32_t gguf_get_version(const struct gguf_context * ctx) {
+    return ctx->version;
+}
+
+size_t gguf_get_alignment(const struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(const struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+int64_t gguf_get_n_kv(const struct gguf_context * ctx) {
+    return ctx->kv.size();
+}
+
+int64_t gguf_find_key(const struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int64_t keyfound = -1;
+
+    const int64_t n_kv = gguf_get_n_kv(ctx);
+
+    for (int64_t i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    return ctx->kv[key_id].get_key().c_str();
+}
+
+enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    return ctx->kv[key_id].is_array ? GGUF_TYPE_ARRAY : ctx->kv[key_id].get_type();
+}
+
+enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].is_array);
+    return ctx->kv[key_id].get_type();
+}
+
+const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data.data();
+}
+
+const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data_string[i].c_str();
+}
+
+size_t gguf_get_arr_n(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+
+    if (ctx->kv[key_id].type == GGUF_TYPE_STRING) {
+        return ctx->kv[key_id].data_string.size();
+    }
+
+    const size_t type_size = gguf_type_size(ctx->kv[key_id].type);
+    GGML_ASSERT(ctx->kv[key_id].data.size() % type_size == 0);
+    return ctx->kv[key_id].data.size() / type_size;
+}
+
+uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint8_t>();
+}
+
+int8_t gguf_get_val_i8(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int8_t>();
+}
+
+uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint16_t>();
+}
+
+int16_t gguf_get_val_i16(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int16_t>();
+}
+
+uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint32_t>();
+}
+
+int32_t gguf_get_val_i32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int32_t>();
+}
+
+float gguf_get_val_f32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<float>();
+}
+
+uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint64_t>();
+}
+
+int64_t gguf_get_val_i64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int64_t>();
+}
+
+double gguf_get_val_f64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<double>();
+}
+
+bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<bool>();
+}
+
+const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<std::string>().c_str();
+}
+
+const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data.data();
+}
+
+int64_t gguf_get_n_tensors(const struct gguf_context * ctx) {
+    return ctx->info.size();
+}
+
+int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int64_t tensor_id = -1;
+
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensor_id = i;
+            break;
+        }
+    }
+
+    return tensor_id;
+}
+
+size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].offset;
+}
+
+const char * gguf_get_tensor_name(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].t.name;
+}
+
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].t.type;
+}
+
+size_t gguf_get_tensor_size(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ggml_nbytes(&ctx->info[tensor_id].t);
+}
+
+int64_t gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int64_t key_id = gguf_find_key(ctx, key);
+    if (key_id >= 0) {
+        ctx->kv.erase(ctx->kv.begin() + key_id);
+    }
+    return key_id;
+}
+
+template <typename T>
+static void gguf_check_reserved_keys(const std::string & key, const T val) {
+    if (key == GGUF_KEY_GENERAL_ALIGNMENT) {
+        if constexpr (std::is_same<T, uint32_t>::value) {
+            GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
+        } else {
+            GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
+        }
+    }
+}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, std::string(val));
+}
+
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n) {
+    gguf_check_reserved_keys(key, data);
+    gguf_remove_key(ctx, key);
+
+    const size_t nbytes = n*gguf_type_size(type);
+    std::vector<int8_t> tmp(nbytes);
+    if (!tmp.empty()) {
+        memcpy(tmp.data(), data, nbytes);
+    }
+    ctx->kv.emplace_back(key, tmp);
+    ctx->kv.back().cast(type);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, size_t n) {
+    gguf_check_reserved_keys(key, data);
+    gguf_remove_key(ctx, key);
+
+    std::vector<std::string> tmp(n);
+    for (size_t i = 0; i < n; ++i) {
+        tmp[i] = data[i];
+    }
+    ctx->kv.emplace_back(key, tmp);
+}
+
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
+    const int64_t n_kv = gguf_get_n_kv(src);
+    for (int64_t i = 0; i < n_kv; ++i) {
+        const struct gguf_kv & kv = src->kv[i];
+
+        if (!kv.is_array) {
+            switch (kv.get_type()) {
+                case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, kv.get_key().c_str(), kv.get_val<uint8_t>());             break;
+                case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, kv.get_key().c_str(), kv.get_val<int8_t>());              break;
+                case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, kv.get_key().c_str(), kv.get_val<uint16_t>());            break;
+                case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, kv.get_key().c_str(), kv.get_val<int16_t>());             break;
+                case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, kv.get_key().c_str(), kv.get_val<uint32_t>());            break;
+                case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, kv.get_key().c_str(), kv.get_val<int32_t>());             break;
+                case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, kv.get_key().c_str(), kv.get_val<float>());               break;
+                case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, kv.get_key().c_str(), kv.get_val<uint64_t>());            break;
+                case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, kv.get_key().c_str(), kv.get_val<int64_t>());             break;
+                case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, kv.get_key().c_str(), kv.get_val<double>());              break;
+                case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, kv.get_key().c_str(), kv.get_val<bool>());                break;
+                case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, kv.get_key().c_str(), kv.get_val<std::string>().c_str()); break;
+                case GGUF_TYPE_ARRAY:
+                default: GGML_ABORT("invalid type");
+            }
+            continue;
+        }
+
+        const size_t ne = kv.get_ne();
+
+        switch (kv.get_type()) {
+            case GGUF_TYPE_UINT8:
+            case GGUF_TYPE_INT8:
+            case GGUF_TYPE_UINT16:
+            case GGUF_TYPE_INT16:
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:
+            case GGUF_TYPE_FLOAT32:
+            case GGUF_TYPE_UINT64:
+            case GGUF_TYPE_INT64:
+            case GGUF_TYPE_FLOAT64:
+            case GGUF_TYPE_BOOL: {
+                gguf_set_arr_data(ctx, kv.get_key().c_str(), kv.get_type(), kv.data.data(), ne);
+            } break;
+            case GGUF_TYPE_STRING: {
+                std::vector<const char *> tmp(ne);
+                for (size_t j = 0; j < ne; ++j) {
+                    tmp[j] = kv.data_string[j].c_str();
+                }
+                gguf_set_arr_str(ctx, kv.get_key().c_str(), tmp.data(), ne);
+            } break;
+            case GGUF_TYPE_ARRAY:
+            default: GGML_ABORT("invalid type");
+        }
+    }
+}
+
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ABORT("duplicate tensor name: %s", tensor->name);
+    }
+
+    struct gguf_tensor_info ti;
+    ti.t = *tensor;
+    ti.offset = ctx->info.empty() ? 0 :
+        ctx->info.back().offset + GGML_PAD(ggml_nbytes(&ctx->info.back().t), ctx->alignment);
+    ctx->info.push_back(ti);
+}
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int64_t tensor_id = gguf_find_tensor(ctx, name);
+    if (tensor_id < 0) {
+        GGML_ABORT("tensor not found: %s", name);
+    }
+    struct ggml_tensor * tensor = &ctx->info[tensor_id].t;
+    const size_t  type_size = ggml_type_size(type);
+    const int64_t blck_size = ggml_blck_size(type);
+
+    tensor->type = type;
+    GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
+
+    tensor->nb[0] = type_size;
+    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+        tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
+    }
+
+    // update offsets
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+    for (int64_t i = tensor_id + 1; i < n_tensors; ++i) {
+        ctx->info[i].offset = ctx->info[i - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[i - 1].t), ctx->alignment);
+    }
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data) {
+    const int64_t tensor_id = gguf_find_tensor(ctx, name);
+    if (tensor_id < 0) {
+        GGML_ABORT("tensor not found: %s", name);
+    }
+
+    ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
+}
+
+struct gguf_writer {
+    std::vector<int8_t> & buf;
+
+    gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
+
+    template <typename T>
+    void write(const T & val) const {
+        for (size_t i = 0; i < sizeof(val); ++i) {
+            buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
+        }
+    }
+
+    void write(const std::vector<int8_t> & val) const {
+        buf.insert(buf.end(), val.begin(), val.end());
+    }
+
+    void write(const bool & val) const {
+        const int8_t val8 = val ? 1 : 0;
+        write(val8);
+    }
+
+    void write(const std::string & val) const {
+        {
+            const uint64_t n = val.length();
+            write(n);
+        }
+        for (size_t i = 0; i < val.length(); ++i) {
+            buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
+        }
+    }
+
+    void write(const char * val) const {
+        write(std::string(val));
+    }
+
+    void write(const enum ggml_type & val) const {
+        write(int32_t(val));
+    }
+
+    void write(const enum gguf_type & val) const {
+        write(int32_t(val));
+    }
+
+    void write(const struct gguf_kv & kv) const {
+        const uint64_t ne = kv.get_ne();
+
+        write(kv.get_key());
+
+        if (kv.is_array) {
+            write(GGUF_TYPE_ARRAY);
+            write(kv.get_type());
+            write(ne);
+        } else {
+            write(kv.get_type());
+        }
+
+        switch (kv.get_type()) {
+            case GGUF_TYPE_UINT8:
+            case GGUF_TYPE_INT8:
+            case GGUF_TYPE_UINT16:
+            case GGUF_TYPE_INT16:
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:
+            case GGUF_TYPE_FLOAT32:
+            case GGUF_TYPE_UINT64:
+            case GGUF_TYPE_INT64:
+            case GGUF_TYPE_FLOAT64: {
+                write(kv.data);
+            } break;
+            case GGUF_TYPE_BOOL: {
+                for (size_t i = 0; i < ne; ++i) {
+                    write(kv.get_val<bool>(i));
+                }
+            } break;
+            case GGUF_TYPE_STRING: {
+                for (size_t i = 0; i < ne; ++i) {
+                    write(kv.get_val<std::string>(i));
+                }
+            } break;
+            case GGUF_TYPE_ARRAY:
+            default: GGML_ABORT("invalid type");
+        }
+    }
+
+    void write_tensor_meta(const struct gguf_tensor_info & info) const {
+        write(info.t.name);
+
+        const uint32_t n_dims = ggml_n_dims(&info.t);
+        write(n_dims);
+
+        for (uint32_t j = 0; j < n_dims; ++j) {
+            write(info.t.ne[j]);
+        }
+        write(info.t.type);
+        write(info.offset);
+    }
+
+    void pad(const size_t alignment) const {
+        while (buf.size() % alignment != 0) {
+            const int8_t zero = 0;
+            write(zero);
+        }
+    }
+
+    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
+        GGML_ASSERT(buf.size() - offset_data == info.offset);
+
+        GGML_ASSERT(ggml_is_contiguous(&info.t));
+        const size_t offset = buf.size();
+        const size_t nbytes = ggml_nbytes(&info.t);
+
+        buf.resize(offset + nbytes);
+        if (info.t.buffer) {
+            ggml_backend_tensor_get(&info.t, buf.data() + offset, 0, nbytes);
+        } else {
+            GGML_ASSERT(info.t.data);
+            memcpy(buf.data() + offset, info.t.data, nbytes);
+        }
+
+        pad(alignment);
+    }
+};
+
+void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
+    const struct gguf_writer gw(buf);
+
+    const int64_t n_kv      = gguf_get_n_kv(ctx);
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+
+    // write header
+    gw.write(GGUF_MAGIC[0]);
+    gw.write(GGUF_MAGIC[1]);
+    gw.write(GGUF_MAGIC[2]);
+    gw.write(GGUF_MAGIC[3]);
+    gw.write(ctx->version);
+    gw.write(n_tensors);
+    gw.write(n_kv);
+
+    // write key-value pairs
+    for (int64_t i = 0; i < n_kv; ++i) {
+        gw.write(ctx->kv[i]);
+    }
+
+    // write tensor info
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        gw.write_tensor_meta(ctx->info[i]);
+    }
+
+    // we require the data section to be aligned
+    gw.pad(ctx->alignment);
+
+    if (only_meta) {
+        return;
+    }
+
+    const size_t offset_data = gw.buf.size();
+
+    // write tensor data
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        gw.write_tensor_data(ctx->info[i], offset_data, ctx->alignment);
+    }
+}
+
+bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = ggml_fopen(fname, "wb");
+
+    if (!file) {
+        fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
+        return false;
+    }
+
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, only_meta);
+    const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
+    fclose(file);
+    return ok;
+}
+
+size_t gguf_get_meta_size(const struct gguf_context * ctx) {
+    // only return size
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
+    return buf.size();
+}
+
+void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
+    memcpy(data, buf.data(), buf.size());
+}
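The write path above replaces the removed gguf_buf with a std::vector<int8_t> and reports failure through a bool return instead of aborting. A minimal writer-side sketch (illustrative only; the key names and output path are placeholders):

    #include "gguf.h"

    int main() {
        struct gguf_context * ctx = gguf_init_empty();

        // "general.alignment" is reserved: gguf_check_reserved_keys() enforces u32 and a power of 2
        gguf_set_val_u32 (ctx, "general.alignment", 32);
        gguf_set_val_str (ctx, "example.name",      "demo");
        gguf_set_val_bool(ctx, "example.flag",      true);

        // no tensors were added, so only the metadata section is written either way
        const bool ok = gguf_write_to_file(ctx, "demo.gguf", /*only_meta =*/ true);

        gguf_free(ctx);
        return ok ? 0 : 1;
    }
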
diff --git a/gguf-py/README.md b/gguf-py/README.md
index 24af96a17..37a75923b 100644
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -15,13 +15,13 @@ pip install gguf
 
 [examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
 
-[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
+[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
 
-[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
 
-[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
+[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
 
-[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
+[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
 
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 273370370..9d0e7489f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -102,6 +102,8 @@ class Keys:
         EXPERT_USED_COUNT            = "{arch}.expert_used_count"
         EXPERT_SHARED_COUNT          = "{arch}.expert_shared_count"
         EXPERT_WEIGHTS_SCALE         = "{arch}.expert_weights_scale"
+        EXPERT_WEIGHTS_NORM          = "{arch}.expert_weights_norm"
+        EXPERT_GATING_FUNC           = "{arch}.expert_gating_func"
         POOLING_TYPE                 = "{arch}.pooling_type"
         LOGIT_SCALE                  = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID       = "{arch}.decoder_start_token_id"
@@ -255,6 +257,7 @@ class MODEL_ARCH(IntEnum):
     MAMBA            = auto()
    XVERSE           = auto()
     COMMAND_R        = auto()
+    COHERE2          = auto()
     DBRX             = auto()
     OLMO             = auto()
     OLMO2            = auto()
@@ -312,6 +315,7 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_SHEXP   = auto()
     FFN_DOWN_SHEXP   = auto()
     FFN_UP_SHEXP     = auto()
+    FFN_EXP_PROBS_B  = auto()
     ATTN_Q_NORM      = auto()
     ATTN_K_NORM      = auto()
     LAYER_OUT_NORM   = auto()
@@ -437,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.MAMBA:           "mamba",
     MODEL_ARCH.XVERSE:          "xverse",
     MODEL_ARCH.COMMAND_R:       "command-r",
+    MODEL_ARCH.COHERE2:         "cohere2",
     MODEL_ARCH.DBRX:            "dbrx",
     MODEL_ARCH.OLMO:            "olmo",
     MODEL_ARCH.OLMO2:           "olmo2",
@@ -496,6 +501,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_GATE_EXP:        "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP:        "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP:          "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.FFN_EXP_PROBS_B:     "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM:      "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.SSM_IN:              "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D:          "blk.{bid}.ssm_conv1d",
@@ -1136,6 +1142,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ATTN_K_NORM,
         MODEL_TENSOR.ATTN_Q_NORM,
     ],
+    MODEL_ARCH.COHERE2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.DBRX: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1276,6 +1294,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
     MODEL_ARCH.CHATGLM : [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -1576,6 +1595,11 @@ class GGMLQuantizationType(IntEnum):
     TQ2_0   = 35
 
 
+class ExpertGatingFuncType(IntEnum):
+    SOFTMAX = 1
+    SIGMOID = 2
+
+
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 3023b539a..4a0a65e3c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -26,6 +26,7 @@ from .constants import (
     RopeScalingType,
     PoolingType,
     TokenType,
+    ExpertGatingFuncType,
 )
 
 from .quants import quant_shape_from_byte_shape
@@ -715,6 +716,12 @@ class GGUFWriter:
     def add_expert_weights_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
 
+    def add_expert_weights_norm(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
+
+    def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
diff --git a/gguf-py/scripts/__init__.py b/gguf-py/gguf/scripts/__init__.py
similarity index 100%
rename from gguf-py/scripts/__init__.py
rename to gguf-py/gguf/scripts/__init__.py
diff --git a/gguf-py/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py
similarity index 100%
rename from gguf-py/scripts/gguf_convert_endian.py
rename to gguf-py/gguf/scripts/gguf_convert_endian.py
diff --git a/gguf-py/scripts/gguf_dump.py b/gguf-py/gguf/scripts/gguf_dump.py
similarity index 100%
rename from gguf-py/scripts/gguf_dump.py
rename to gguf-py/gguf/scripts/gguf_dump.py
diff --git a/gguf-py/scripts/gguf_hash.py b/gguf-py/gguf/scripts/gguf_hash.py
similarity index 100%
rename from gguf-py/scripts/gguf_hash.py
rename to gguf-py/gguf/scripts/gguf_hash.py
diff --git a/gguf-py/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py
similarity index 100%
rename from gguf-py/scripts/gguf_new_metadata.py
rename to gguf-py/gguf/scripts/gguf_new_metadata.py
diff --git a/gguf-py/scripts/gguf_set_metadata.py b/gguf-py/gguf/scripts/gguf_set_metadata.py
similarity index 100%
rename from gguf-py/scripts/gguf_set_metadata.py
rename to gguf-py/gguf/scripts/gguf_set_metadata.py
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7009a11d4..efe2a4aa4 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -276,6 +276,10 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
         ),
 
+        MODEL_TENSOR.FFN_EXP_PROBS_B: (
+            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+        ),
+
         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 9c3956256..92d7f22ec 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,12 +1,11 @@
 [tool.poetry]
 name = "gguf"
-version = "0.13.0"
+version = "0.14.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
     {include = "gguf"},
     {include = "gguf/py.typed"},
-    {include = "scripts"},
 ]
 readme = "README.md"
 homepage = "https://ggml.ai"
@@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
-gguf-dump = "scripts:gguf_dump_entrypoint"
-gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
-gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
+gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
+gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
+gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
+gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
index 1500cb2fc..11306b17f 100644
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -9,7 +9,7 @@
 #include "llama.h"
 
 struct llama_model_deleter {
-    void operator()(llama_model * model) { llama_free_model(model); }
+    void operator()(llama_model * model) { llama_model_free(model); }
 };
 
 struct llama_context_deleter {
diff --git a/include/llama.h b/include/llama.h
index 7b305b299..0295a51fb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -34,7 +34,6 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -105,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA       = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
@@ -413,12 +413,19 @@ extern "C" {
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+                             struct llama_model_params params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
                              struct llama_model_params params);
 
-    // TODO: rename to llama_model_free
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
 
     // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
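llama_load_model_from_file and llama_free_model are kept but now carry DEPRECATED attributes, matching the llama_model_deleter change in llama-cpp.h above. A minimal sketch of the replacement pair (illustrative; the model path is a placeholder):

    #include "llama.h"

    int main() {
        struct llama_model_params mparams = llama_model_default_params();

        // new names; the deprecated llama_load_model_from_file/llama_free_model
        // still compile, but emit deprecation warnings
        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (!model) {
            return 1;
        }

        llama_model_free(model);
        return 0;
    }
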
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index b4ac38bbf..a0921f1a9 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f
+c8bd0fee71dc8328d93be301bbee06bc10d30429
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 9fd7edea3..d4879b778 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         // device buft and device ctx
         auto * model_tensor = llama_model_get_tensor(model, name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }
 
         // save tensor to adapter
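The reworked shape validation above pairs with the get_scale() helper added to llama_lora_weight in llama-adapter.h below, which derives the effective LoRA scale from the adapter's alpha and the rank (taken from lora_b's first dimension). A standalone sketch of the same arithmetic:

    #include <cstdio>

    // mirrors llama_lora_weight::get_scale(): alpha == 0 falls back to the raw adapter scale
    static float lora_scale(float alpha, float adapter_scale, float rank) {
        return alpha ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        // e.g. a rank-16 adapter trained with alpha = 32, applied at strength 1.0
        printf("%.2f\n", lora_scale(32.0f, 1.0f, 16.0f)); // prints 2.00
        return 0;
    }
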
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 5f1870cc8..3448656b1 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -45,6 +45,13 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) {
+        const float rank  = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
     llama_lora_weight() = default;
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a60038385..007d79f82 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -39,6 +39,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MAMBA,            "mamba"        },
     { LLM_ARCH_XVERSE,           "xverse"       },
     { LLM_ARCH_COMMAND_R,        "command-r"    },
+    { LLM_ARCH_COHERE2,          "cohere2"      },
     { LLM_ARCH_DBRX,             "dbrx"         },
     { LLM_ARCH_OLMO,             "olmo"         },
     { LLM_ARCH_OLMO2,            "olmo2"        },
@@ -91,6 +92,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT,           "%s.expert_used_count"           },
     { LLM_KV_EXPERT_SHARED_COUNT,         "%s.expert_shared_count"         },
     { LLM_KV_EXPERT_WEIGHTS_SCALE,        "%s.expert_weights_scale"        },
+    { LLM_KV_EXPERT_WEIGHTS_NORM,         "%s.expert_weights_norm"         },
+    { LLM_KV_EXPERT_GATING_FUNC,          "%s.expert_gating_func"          },
     { LLM_KV_POOLING_TYPE,                "%s.pooling_type"                },
     { LLM_KV_LOGIT_SCALE,                 "%s.logit_scale"                 },
     { LLM_KV_DECODER_START_TOKEN_ID,      "%s.decoder_start_token_id"      },
@@ -807,6 +810,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_COHERE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_DBRX,
         {
@@ -968,6 +986,7 @@
             { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
     {
@@ -1350,6 +1369,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_DOWN_EXPS,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_CONV1D,                  {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 446e72eeb..45e458bb9 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -43,6 +43,7 @@ enum llm_arch {
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
+    LLM_ARCH_COHERE2,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
     LLM_ARCH_OLMO2,
@@ -95,6 +96,8 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -230,6 +233,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index a07e9cf00..44670d3d8 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -45,6 +45,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "vicuna-orca",       LLM_CHAT_TEMPLATE_VICUNA_ORCA       },
     { "deepseek",          LLM_CHAT_TEMPLATE_DEEPSEEK          },
     { "deepseek2",         LLM_CHAT_TEMPLATE_DEEPSEEK_2        },
+    { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3        },
     { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
     { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
     { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3         },
@@ -148,6 +149,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_MINICPM;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+    } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
     } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
@@ -453,6 +456,21 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "Assistant:";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
+        // DeepSeek-V3
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << LU8("<|User|>") << message->content;
+            } else if (role == "assistant") {
+                ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
+            }
+        }
+        if (add_ass) {
+            ss << LU8("<|Assistant|>");
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 364318c27..b8e94d9ef 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -25,6 +25,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_VICUNA_ORCA,
     LLM_CHAT_TEMPLATE_DEEPSEEK,
     LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
     LLM_CHAT_TEMPLATE_CHATGML_3,
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 3a76b71a4..a29f20ec4 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -6,7 +6,13 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 160  // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256  // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
 
 struct llama_hparams_posnet {
     uint32_t n_embd;
@@ -54,7 +60,9 @@ struct llama_hparams {
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups   = 0;
 
-    float expert_weights_scale = 0.0;
+    float    expert_weights_scale = 0.0;
+    bool     expert_weights_norm  = false;
+    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
 
     float f_norm_eps;
     float f_norm_rms_eps;
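The new hparams describe DeepSeek-V3-style MoE routing: expert_gating_func selects how raw expert scores become routing weights (softmax, or per-expert sigmoid as in DeepSeek-V3) and expert_weights_norm asks for the selected weights to be re-normalized to sum to 1 before scaling. A simplified, standalone sketch of the two gating functions on one token's expert scores (not the actual ggml graph code, which also performs top-k selection):

    #include <cmath>
    #include <vector>

    static std::vector<float> gate(const std::vector<float> & scores, bool sigmoid_gate, bool weights_norm) {
        std::vector<float> w(scores.size());
        if (sigmoid_gate) { // LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID
            for (size_t i = 0; i < w.size(); ++i) {
                w[i] = 1.0f/(1.0f + expf(-scores[i]));
            }
        } else {            // LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX (numerically naive)
            float sum = 0.0f;
            for (size_t i = 0; i < w.size(); ++i) {
                w[i] = expf(scores[i]);
                sum += w[i];
            }
            for (float & x : w) {
                x /= sum;
            }
        }
        if (weights_norm) { // expert_weights_norm: make the kept weights sum to 1
            float sum = 0.0f;
            for (float x : w) { sum += x; }
            for (float & x : w) { x /= sum; }
        }
        return w;
    }
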
a05ba4f63..6ec709dd3 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -1,5 +1,6 @@ #include "llama-impl.h" +#include "gguf.h" #include "llama.h" #include @@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); std::stringstream ss; ss << "["; for (int j = 0; j < arr_n; j++) { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 53379253a..90b6c56ed 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -119,10 +119,10 @@ bool llama_kv_cache_init( struct llama_kv_cache_slot_info llama_kv_cache_find_slot( struct llama_kv_cache & cache, - const struct llama_ubatch & batch) { - const uint32_t n_tokens = batch.n_tokens; - const uint32_t n_seqs = batch.n_seqs; - const uint32_t n_seq_tokens = batch.n_seq_tokens; + const struct llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; if (cache.recurrent) { // For recurrent state architectures (like Mamba or RWKV), @@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // A slot should always be contiguous. // can only process batches with an equal number of new tokens in each sequence - GGML_ASSERT(batch.equal_seqs); + GGML_ASSERT(ubatch.equal_seqs); int32_t min = cache.size - 1; int32_t max = 0; // everything should fit if all seq_ids are smaller than the max for (uint32_t s = 0; s < n_seqs; ++s) { - const uint32_t n_seq_id = batch.n_seq_id[s]; + const uint32_t n_seq_id = ubatch.n_seq_id[s]; for (uint32_t j = 0; j < n_seq_id; ++j) { - const llama_seq_id seq_id = batch.seq_id[s][j]; + const llama_seq_id seq_id = ubatch.seq_id[s][j]; if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { // too big seq_id @@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find usable cell range for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; llama_kv_cell & seq_meta = cache.cells[seq_id]; bool has_cell = false; if (seq_meta.tail >= 0) { @@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // gather and re-order for (uint32_t s = 0; s < n_seqs; ++s) { int32_t dst_id = s + min; - int32_t src_id = cache.cells[batch.seq_id[s][0]].tail; + int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail; if (dst_id != src_id) { llama_kv_cell & dst_cell = cache.cells[dst_id]; llama_kv_cell & src_cell = cache.cells[src_id]; @@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // update the pos of the used seqs for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; int32_t cell_id = s + min; llama_kv_cell & cell = cache.cells[cell_id]; @@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // What should happen when the pos backtracks or skips a value? // Clearing the state mid-batch would require special-casing which isn't done.
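The gguf_kv_to_str change above avoids calling gguf_get_arr_data() on string arrays; string elements are instead fetched one at a time. A hedged sketch of that access pattern, assuming the gguf_get_arr_str(ctx, key_id, j) accessor from gguf.h and the integer signatures used elsewhere in this diff (only INT32 handled among the plain types, for brevity):

```cpp
#include <cstdint>
#include <sstream>
#include <string>

#include "gguf.h"

// Illustrative only: format a GGUF array value, using the per-element string
// accessor for GGUF_TYPE_STRING and the raw data block for plain types.
static std::string arr_to_str(const gguf_context * ctx, int i) {
    const enum gguf_type arr_type = gguf_get_arr_type(ctx, i);
    const int arr_n = (int) gguf_get_arr_n(ctx, i);
    std::stringstream ss;
    ss << "[";
    for (int j = 0; j < arr_n; j++) {
        if (j > 0) ss << ", ";
        if (arr_type == GGUF_TYPE_STRING) {
            ss << '"' << gguf_get_arr_str(ctx, i, j) << '"'; // no gguf_get_arr_data() here
        } else if (arr_type == GGUF_TYPE_INT32) {
            ss << ((const int32_t *) gguf_get_arr_data(ctx, i))[j];
        } else {
            ss << "?"; // other scalar types elided for brevity
        }
    }
    ss << "]";
    return ss.str();
}
```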
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", - __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens); + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); } cell.pos = last_pos; cell.seq_id.clear(); - for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) { - const llama_seq_id seq_id = batch.seq_id[s][j]; + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; cell.seq_id.insert(seq_id); cache.cells[seq_id].tail = cell_id; } @@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; s++) { for (uint32_t i = 0; i < n_seq_tokens; ++i) { uint32_t k = s*n_seq_tokens + i; - cache.cells[cache.head + k].pos = batch.pos[k]; + cache.cells[cache.head + k].pos = ubatch.pos[k]; - for (int32_t j = 0; j < batch.n_seq_id[s]; j++) { - cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { + cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]); } } } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index a99326335..a8cb9439b 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -241,12 +241,16 @@ llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } -int llama_file::fileno() const { +int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); +#else +#if defined(fileno) + return fileno(pimpl->fp); #else return ::fileno(pimpl->fp); #endif +#endif } void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } @@ -265,7 +269,7 @@ struct llama_mmap::impl { impl(struct llama_file * file, size_t prefetch, bool numa) { size = file->size(); - int fd = file->fileno(); + int fd = file->file_id(); int flags = MAP_SHARED; if (numa) { prefetch = 0; } #ifdef __linux__ @@ -357,7 +361,7 @@ struct llama_mmap::impl { size = file->size(); - HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno()); + HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id()); HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 6bcddee8c..1da9ecb6b 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -18,7 +18,7 @@ struct llama_file { size_t tell() const; size_t size() const; - int fileno() const; + int file_id() const; // fileno overload void seek(size_t offset, int whence) const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 7743b4652..1c4e30878 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -18,7 +18,7 @@ const char * llama_file_version_name(llama_fver version) { } namespace GGUFMeta { - template + template struct GKV_Base_Type { static constexpr gguf_type gt = gt_; @@ -60,10 +60,11 @@ namespace GGUFMeta { public: static constexpr gguf_type gt = GGUF_TYPE_ARRAY; static ArrayInfo getter(const gguf_context *ctx, const int k) { + const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); return ArrayInfo { - gguf_get_arr_type(ctx, k), + arr_type, size_t(gguf_get_arr_n(ctx, k)), - gguf_get_arr_data(ctx, k), + arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), }; } }; @@ -553,7 +554,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, const enum gguf_type type = gguf_get_kv_type(meta.get(), i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? 
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) + ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) : gguf_type_name(type); std::string value = gguf_kv_to_str(meta.get(), i); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ace0ba262..7deb3683b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) { case MODEL_70B: return "70B"; case MODEL_236B: return "236B"; case MODEL_314B: return "314B"; + case MODEL_671B: return "671B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; @@ -125,6 +126,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { } } +static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) { + switch (type) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax"; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid"; + default: return "unknown"; + } +} + std::string llama_model_arch_name (const llama_model & model) { return llm_arch_name(model.arch); } @@ -786,6 +795,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_COHERE2: + { + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_DBRX: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -923,11 +942,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + // for compatibility with existing DeepSeek V2 and V2.5 GGUFs + // that have no expert_gating_func model parameter set + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; + } ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); switch (hparams.n_layer) { case 27: model.type = e_model::MODEL_16B; break; case 60: model.type = e_model::MODEL_236B; break; + case 61: model.type = e_model::MODEL_671B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -1249,6 +1276,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) { tokenizer_pre == "deepseek-coder") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER; vocab.tokenizer_clean_spaces = false; + } else if ( + tokenizer_pre == "deepseek-v3") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; + vocab.tokenizer_clean_spaces = false; } else if ( tokenizer_pre == "falcon") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON; @@ -1892,24 +1923,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", 
__func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } - if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } - if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } + if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } + if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); } + if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } + if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } - if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d 
'%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); } - if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); } - if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); } - if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); } - if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); } - if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); } + if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); } + if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); } + if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); } + if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); } + if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); } + if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); } for (const auto & id : vocab.special_eog_ids) { LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() ); @@ -1931,6 +1962,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func)); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); } @@ -1976,6 +2009,10 @@ struct llama_model_params llama_model_default_params() { } void llama_free_model(struct llama_model * model) { + llama_model_free(model); +} + +void llama_model_free(struct llama_model * model) { delete model; } @@ -2031,6 +2068,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: + case LLM_ARCH_COHERE2: case LLM_ARCH_OLMO: case LLM_ARCH_ARCTIC: case 
LLM_ARCH_DEEPSEEK: diff --git a/src/llama-model.h b/src/llama-model.h index 01c780c41..ce038932d 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -63,6 +63,7 @@ enum llm_type { MODEL_70B, MODEL_236B, MODEL_314B, + MODEL_671B, MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE, @@ -213,6 +214,7 @@ struct llama_layer { struct ggml_tensor * ffn_down_b = nullptr; // b2 struct ggml_tensor * ffn_up_b = nullptr; // b3 struct ggml_tensor * ffn_act = nullptr; + struct ggml_tensor * ffn_exp_probs_b = nullptr; // mamba proj struct ggml_tensor * ssm_in = nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 42974f8f1..466e7bc61 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -7,14 +7,12 @@ #include #include #include +#include #include #include #include #include -// TODO: replace with ggml API call -#define QK_K 256 - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -22,7 +20,7 @@ static void zeros(std::ofstream & file, size_t n) { } } -struct quantize_state_internal { +struct quantize_state_impl { const llama_model & model; const llama_model_quantize_params * params; @@ -43,13 +41,13 @@ struct quantize_state_internal { // used to figure out if a model shares tok_embd with the output weight bool has_output = false; - quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) {} }; -static void llama_tensor_dequantize_internal( +static void llama_tensor_dequantize_impl( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread ) { @@ -121,7 +119,7 @@ static void llama_tensor_dequantize_internal( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { new_type = qs.params->output_tensor_type; } else { - int nx = tensor->ne[0]; - if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + const int64_t nx = tensor->ne[0]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; //} bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || - new_type == GGML_TYPE_IQ1_M) { - int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are 
not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); + { + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } else { ++qs.n_k_quantized; } } + if (convert_incompatible_tensor) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -410,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { +static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); @@ -464,7 +463,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa return new_size; } -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { +static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -534,7 +533,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_hparams(ml, model); llm_load_stats (ml, model); - struct quantize_state_internal qs(model, params); + struct quantize_state_impl qs(model, params); if (params->only_copy) { ftype = model.ftype; @@ -837,7 +836,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { - llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); + llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); f32_data = (float *) f32_conv_buf.data(); } @@ -866,7 +865,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; - new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } @@ -875,7 +874,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // update the gguf meta data as we go gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); - gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size); + GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); + gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); // write tensor data + padding fout.write((const char *) new_data, new_size); @@ -919,7 +919,7 @@ uint32_t llama_model_quantize( const char * fname_out, const llama_model_quantize_params * params) { try { - llama_model_quantize_internal(fname_inp, fname_out, params); + llama_model_quantize_impl(fname_inp, fname_out, params); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); return 1; diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 69cea2f14..ef5a576cc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) for (int i = 0; i < (int)cur_p->size; ++i) { const float val = cur_p->data[i].logit; int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); - ib = std::max(0, std::min(nbuckets-1, ib)); + ib = std::max(0, std::min(nbuckets - 1, ib)); bucket_idx[i] = ib; ++histo[ib]; } @@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) for (int i = 0; i < (int)cur_p->size; ++i) { int j = bucket_idx[i]; if (j >= ib) { - *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i]; + *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i]; } } ptr = tmp_tokens.data(); int ndone = 0; - for (int j = nbuckets-1; j > ib; --j) { + for (int j = nbuckets - 1; j > ib; --j) { std::sort(ptr, ptr + histo[j], comp); ptr += histo[j]; ndone += histo[j]; @@ -1832,7 +1832,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat ctx->dry_repeat_count[last - k] = std::min(n, rep_limit); if (n > 0) { lt = k; - rt = k+n-1; + rt = k + n - 1; } } else { // If k is inside the current Z-box, consider two cases. 
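Stepping back to the llama-quant.cpp changes above: the hardcoded QK_K = 256 is gone, and the divisibility checks now use ggml_blck_size(new_type), so each quant type is validated against its actual block size. A standalone sketch of the fallback logic, with block sizes hardcoded for illustration (the real code queries ggml):

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative block sizes: K-quants use 256-wide blocks, Q8_0 uses 32.
// The real code asks ggml via ggml_blck_size(new_type).
static const int64_t QK_K_LIKE = 256;
static const int64_t QK8_0     = 32;

int main() {
    const int64_t nx = 11040;                     // hypothetical row width
    const bool k_quant_ok = nx % QK_K_LIKE == 0;  // false: 11040 = 43*256 + 32
    const bool q8_0_ok    = nx % QK8_0     == 0;  // true:  11040 = 345*32
    // Mirrors the fallback above: an incompatible tensor keeps its data by
    // being converted to a type whose block size divides the row width.
    printf("K-quant ok: %d, Q8_0 fallback ok: %d\n", k_quant_ok, q8_0_ok); // 0, 1
    return 0;
}
```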
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 909e04871..a4c015484 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -382,6 +382,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "\\p{N}+", }; break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM: + regex_exprs = { + "\\p{N}{1,3}", + "[一-龥぀-ゟ゠-ヿ]+", + "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: regex_exprs = { "[\r\n]", @@ -490,7 +497,7 @@ struct llm_tokenizer_bpe_session { bool append_bos(std::vector & output) const { if (vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); return true; } @@ -499,7 +506,7 @@ struct llm_tokenizer_bpe_session { bool append_eos(std::vector & output) const { if (vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); return true; } @@ -1396,7 +1403,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { - buffer.erase_after(std::next(buffer.begin(), (source-1))); + buffer.erase_after(std::next(buffer.begin(), (source - 1))); } // repeat for the right side @@ -1410,7 +1417,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { - buffer.erase_after(std::next(buffer.begin(), (source-1))); + buffer.erase_after(std::next(buffer.begin(), (source - 1))); } break; } @@ -1447,7 +1454,7 @@ std::vector llama_tokenize_internal( bool is_prev_special = true; // prefix with space if first token if (add_special && vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); is_prev_special = true; } @@ -1482,7 +1489,7 @@ std::vector llama_tokenize_internal( } if (add_special && vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); } } break; @@ -1515,7 +1522,7 @@ std::vector llama_tokenize_internal( case LLAMA_VOCAB_TYPE_WPM: { if (add_special) { - GGML_ASSERT(vocab.special_cls_id != -1); + GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_cls_id); } @@ -1535,14 +1542,14 @@ std::vector llama_tokenize_internal( } if (add_special) { - GGML_ASSERT(vocab.special_sep_id != -1); + GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_sep_id); } } break; case LLAMA_VOCAB_TYPE_UGM: { if (add_special && vocab.tokenizer_add_bos) { - GGML_ASSERT(vocab.special_bos_id != -1); + GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_bos_id); } llm_tokenizer_ugm_session session(vocab); @@ -1567,7 +1574,7 @@ std::vector llama_tokenize_internal( } if (add_special && vocab.tokenizer_add_eos) { - GGML_ASSERT(vocab.special_eos_id != -1); + GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL); output.push_back(vocab.special_eos_id); } } break; @@ -1635,7 +1642,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla } bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { - 
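The first DeepSeek-V3 pre-tokenizer rule above, "\\p{N}{1,3}", caps each digit match at three codepoints, so long numbers are split into short chunks. A rough ASCII-only illustration of the effect (the real tokenizer matches arbitrary Unicode numerals through the regex engine):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Rough illustration of what "\\p{N}{1,3}" does to a digit run: the regex
// engine consumes at most three digits per match, left to right.
static std::vector<std::string> chunk_digits(const std::string & s) {
    std::vector<std::string> out;
    for (size_t i = 0; i < s.size(); i += 3) {
        out.push_back(s.substr(i, 3));
    }
    return out;
}

int main() {
    for (const auto & piece : chunk_digits("1234567")) {
        printf("[%s]", piece.c_str()); // [123][456][7]
    }
    printf("\n");
    return 0;
}
```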
return token != -1 && vocab.special_eog_ids.count(token) > 0; + return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0; } bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) { @@ -1874,7 +1881,7 @@ int32_t llama_detokenize_impl( } if (remove_special && vocab.tokenizer_add_eos) { - if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) { + if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) { n_tokens--; } } diff --git a/src/llama.cpp b/src/llama.cpp index d7110b90b..97e716cd6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8,7 +8,6 @@ #include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" -#include "llama-quant.h" #include "ggml.h" #include "ggml-alloc.h" @@ -18,12 +17,8 @@ #include #include #include -#include #include -#include -#include #include -#include #include #include #include @@ -31,10 +26,7 @@ #include #include #include -#include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -1552,6 +1544,32 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_COHERE2: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + // init output from the input tok embed + model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, + llama_model_loader::TENSOR_DUPLICATED); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + } + } + break; case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -1831,6 +1849,7 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } else { layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED); if (n_expert == 0) { throw std::runtime_error("n_expert must be > 0"); @@ -2513,21 +2532,36 @@ static struct ggml_tensor * llm_build_inp_embd( struct ggml_context * ctx, struct llama_context & lctx, const llama_hparams & hparams, - const llama_ubatch & batch, + const llama_ubatch & ubatch, struct ggml_tensor * tok_embd, const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct ggml_tensor * inpL; - if (batch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + if (ubatch.token) { + lctx.inp_tokens = 
ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); cb(lctx.inp_tokens, "inp_tokens", -1); ggml_set_input(lctx.inp_tokens); inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = it.first->get_weight(tok_embd); + if (lora == nullptr) { + continue; + } + const float adapter_scale = it.second; + const float scale = lora->get_scale(it.first->alpha, adapter_scale); + struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( + ctx, lora->b, // non-transposed lora_b + ggml_get_rows(ctx, lora->a, lctx.inp_tokens) + ), scale); + inpL = ggml_add(ctx, inpL, inpL_delta); + } } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); inpL = lctx.inp_embd; ggml_set_input(lctx.inp_embd); } @@ -2598,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm( if (lora == nullptr) { continue; } - const float alpha = it.first->alpha; - const float rank = (float) lora->b->ne[0]; - const float scale = alpha ? it.second * alpha / rank : it.second; + const float adapter_scale = it.second; + const float scale = lora->get_scale(it.first->alpha, adapter_scale); struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -2811,12 +2844,14 @@ static struct ggml_tensor * llm_build_moe_ffn( struct ggml_tensor * up_exps, struct ggml_tensor * gate_exps, struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, int64_t n_expert, int64_t n_expert_used, llm_ffn_op_type type_op, bool norm_w, bool scale_w, float w_scale, +llama_expert_gating_func_type gating_op, const llm_build_cb & cb, int il) { int64_t n_embd = cur->ne[0]; @@ -2825,11 +2860,31 @@ static struct ggml_tensor * llm_build_moe_ffn( ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); - ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } cb(probs, "ffn_moe_probs", il); + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens] + ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] cb(selected_experts->src[0], "ffn_moe_argsort", il); cb(selected_experts, "ffn_moe_topk", il); @@ -3100,7 +3155,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( static struct ggml_tensor * llm_build_mamba( struct ggml_context * ctx, struct llama_context & lctx, - const llama_ubatch & batch, + const llama_ubatch & ubatch, struct ggml_cgraph * graph, struct ggml_tensor * cur, struct ggml_tensor * state_copy, @@ -3116,17 +3171,17 @@ static struct ggml_tensor * llm_build_mamba( const int64_t d_inner = hparams.ssm_d_inner; const 
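The routing in llm_build_moe_ffn() above keeps probs unbiased for the final expert weights while adding exp_probs_b only for top-k selection, the trick introduced in DeepSeek V3. A toy numeric sketch under sigmoid gating; the logit and bias values are invented, and the optional weight normalization (expert_weights_norm) and expert_weights_scale steps are omitted:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> logits = { 1.0f, 0.5f, 0.4f, -0.2f };
    const std::vector<float> bias   = { 0.0f, 0.0f, 0.2f,  0.0f }; // illustrative exp_probs_b
    const int n_expert_used = 2;

    // sigmoid gating (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID)
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = 1.0f / (1.0f + std::exp(-logits[i]));
    }

    // biased copy used only to pick experts
    std::vector<float> selection_probs = probs;
    for (size_t i = 0; i < bias.size(); ++i) {
        selection_probs[i] += bias[i];
    }

    // top-k on the biased scores
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
        [&](int a, int b) { return selection_probs[a] > selection_probs[b]; });

    for (int k = 0; k < n_expert_used; ++k) {
        // weights come from the *unbiased* probs of the selected experts
        printf("expert %d, weight %.3f\n", idx[k], probs[idx[k]]);
    }
    return 0; // expert 2, weight 0.599 / expert 0, weight 0.731
}
```

In this run the bias lifts expert 2 past expert 1 for selection, but its mixing weight is still the unbiased 0.599.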
int64_t d_state = hparams.ssm_d_state; const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; // Some variants of the Mamba arch (e.g. FalconMamba) do apply layer norm on B and Dt layers const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; // Use the same RMS norm as the final layer norm const float norm_rms_eps = hparams.f_norm_rms_eps; - const int64_t n_seq_tokens = batch.n_seq_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(batch.equal_seqs); - GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); struct ggml_tensor * conv_states_all = kv.k_l[il]; struct ggml_tensor * ssm_states_all = kv.v_l[il]; @@ -3926,6 +3981,7 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -3950,9 +4006,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); } @@ -4602,9 +4660,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_GELU, true, false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); @@ -4743,9 +4803,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); @@ -5991,9 +6053,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); @@ -7633,6 +7697,137 @@ struct llm_build_context { } + struct ggml_cgraph * build_cohere2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // cohere2 requires a different mask for layers using sliding window attention (SWA) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + + // sliding window switch pattern + const int32_t sliding_window_pattern = 4; + + for (int il = 0; il < n_layer; ++il) { + // three layers of sliding-window attention (window size 4096) and RoPE + // every fourth layer uses global attention without positional embeddings + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + struct ggml_tensor * KQ_mask_l = is_sliding ?
KQ_mask_swa : KQ_mask; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (is_sliding) { + Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, + beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, + attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + } else { + // For non-sliding layers, just reshape without applying RoPE + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + } + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, + cb, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + // ref: https://allenai.org/olmo // based on the original build_llama() function, changes: // * non-parametric layer norm @@ -7985,9 +8180,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, false, 0.0, + 
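The layer pattern in build_cohere2() above is easy to tabulate; a minimal sketch using the same modulo test, printed for the first eight layers:

```cpp
#include <cstdio>

int main() {
    // Mirrors build_cohere2(): within every group of four layers, the first
    // three use sliding-window attention (and RoPE), the fourth is global
    // attention with no positional encoding.
    const int sliding_window_pattern = 4;
    for (int il = 0; il < 8; ++il) {
        const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
        // layers 0-2: SWA + RoPE, layer 3: global, layers 4-6: SWA + RoPE, layer 7: global
        printf("layer %d: %s\n", il, is_sliding ? "SWA + RoPE" : "global, no RoPE");
    }
    return 0;
}
```

Consistent with the n_layer == 32 case registered in llm_load_hparams() above, this pattern yields 24 SWA layers interleaved with 8 global layers.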
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); @@ -8382,9 +8579,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(cur, "ffn_moe_out", il); @@ -8523,9 +8722,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, n_expert, n_expert_used, LLM_FFN_SILU, false, false, hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, cb, il); cb(moe_out, "ffn_moe_out", il); @@ -8752,9 +8953,11 @@ struct llm_build_context { model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, - LLM_FFN_SILU, false, + LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale, + (enum llama_expert_gating_func_type) hparams.expert_gating_func, cb, il); cb(moe_out, "ffn_moe_out", il); @@ -10384,6 +10587,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_command_r(); } break; + case LLM_ARCH_COHERE2: + { + result = llm.build_cohere2(); + } break; case LLM_ARCH_DBRX: { result = llm.build_dbrx(); @@ -10517,7 +10724,7 @@ static enum ggml_status llama_graph_compute( // return positive int on warning // return negative int on error // -static int llama_decode_internal( +static int llama_decode_impl( llama_context & lctx, llama_batch inp_batch) { @@ -10852,7 +11059,7 @@ static int llama_decode_internal( // return positive int on warning // return negative int on error // -static int llama_encode_internal( +static int llama_encode_impl( llama_context & lctx, llama_batch inp_batch) { @@ -11034,7 +11241,7 @@ static int llama_encode_internal( } // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { +static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; const auto & hparams = lctx.model.hparams; @@ -11254,7 +11461,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_kv_cache_update_internal(struct llama_context & lctx) { +static void llama_kv_cache_update_impl(struct llama_context & lctx) { bool need_reserve = false; if (lctx.kv_self.has_shift) { @@ -11290,7 +11497,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // defragment the KV cache if needed if (lctx.kv_self.do_defrag) { - llama_kv_cache_defrag_internal(lctx); + llama_kv_cache_defrag_impl(lctx); need_reserve = true; @@ -11319,13 +11526,7 @@ int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, float scale) { - if (ctx->cparams.flash_attn) { - LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); - return -1; - } - ctx->lora_adapters[adapter] = scale; - return 0; } @@ -11456,6 +11657,12 @@ int64_t llama_time_us(void) { struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_model_params params) { + return llama_model_load_from_file(path_model, params); +} + +struct llama_model * llama_model_load_from_file( + const char * path_model, + struct llama_model_params params) { ggml_time_init(); llama_model * 
model = new llama_model; @@ -11494,7 +11701,7 @@ struct llama_model * llama_load_model_from_file( ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); if (!rpc_reg) { LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__); - llama_free_model(model); + llama_model_free(model); return nullptr; } @@ -11502,7 +11709,7 @@ struct llama_model * llama_load_model_from_file( ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); if (!ggml_backend_rpc_add_device_fn) { LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__); - llama_free_model(model); + llama_model_free(model); return nullptr; } @@ -11512,7 +11719,7 @@ struct llama_model * llama_load_model_from_file( model->devices.push_back(dev); } else { LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str()); - llama_free_model(model); + llama_model_free(model); return nullptr; } } @@ -11544,7 +11751,7 @@ struct llama_model * llama_load_model_from_file( if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) { LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size()); - llama_free_model(model); + llama_model_free(model); return nullptr; } ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; @@ -11567,7 +11774,7 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); } - llama_free_model(model); + llama_model_free(model); return nullptr; } @@ -11991,7 +12198,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) { } void llama_kv_cache_update(struct llama_context * ctx) { - llama_kv_cache_update_internal(*ctx); + llama_kv_cache_update_impl(*ctx); } bool llama_kv_cache_can_shift(struct llama_context * ctx) { @@ -12003,7 +12210,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) { int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch) { - const int ret = llama_encode_internal(*ctx, batch); + const int ret = llama_encode_impl(*ctx, batch); if (ret != 0) { LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); } @@ -12014,7 +12221,7 @@ int32_t llama_encode( int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch) { - const int ret = llama_decode_internal(*ctx, batch); + const int ret = llama_decode_impl(*ctx, batch); if (ret != 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } @@ -12234,16 +12441,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, return 0; } -int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { +int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) { std::string str_split_path(split_path); char postfix[32]; snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); std::string str_postfix(postfix); - // check if dest ends with postfix + // check if split_prefix ends with postfix int size_prefix = str_split_path.size() - str_postfix.size(); if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); + snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); 
return size_prefix; } @@ -12252,6 +12459,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int const char * llama_print_system_info(void) { static std::string s; + s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { auto * reg = ggml_backend_reg_get(i); diff --git a/src/unicode.cpp b/src/unicode.cpp index 8ed6b1a51..7aca6544b 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -667,18 +667,24 @@ std::vector unicode_regex_split(const std::string & text, const std { "\\p{N}", unicode_cpt_flags::NUMBER }, { "\\p{L}", unicode_cpt_flags::LETTER }, { "\\p{P}", unicode_cpt_flags::PUNCTUATION }, + { "\\p{M}", unicode_cpt_flags::ACCENT_MARK }, + { "\\p{S}", unicode_cpt_flags::SYMBOL }, }; static const std::map k_ucat_cpt = { { unicode_cpt_flags::NUMBER, 0xD1 }, { unicode_cpt_flags::LETTER, 0xD2 }, { unicode_cpt_flags::PUNCTUATION, 0xD3 }, + { unicode_cpt_flags::ACCENT_MARK, 0xD4 }, + { unicode_cpt_flags::SYMBOL, 0xD5 }, }; static const std::map k_ucat_map = { { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9 { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + { unicode_cpt_flags::ACCENT_MARK, "" }, // no sub-128 codepoints + { unicode_cpt_flags::SYMBOL, "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`| }; // compute collapsed codepoints only if needed by at least one regex diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp index 57fa00011..ba084a91a 100644 --- a/tests/test-autorelease.cpp +++ b/tests/test-autorelease.cpp @@ -13,10 +13,10 @@ int main(int argc, char ** argv) { std::thread([&model_path]() { llama_backend_init(); - auto * model = llama_load_model_from_file(model_path, llama_model_default_params()); + auto * model = llama_model_load_from_file(model_path, llama_model_default_params()); auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); }).join(); diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 1bb5fb47c..611957ac0 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -15,66 +15,71 @@ constexpr int offset_has_tensors = 2000; constexpr int offset_has_data = 3000; enum handcrafted_file_type { - HANDCRAFTED_HEADER_BAD_MAGIC = 10, - HANDCRAFTED_HEADER_BAD_VERSION_1 = 20, - HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30, - HANDCRAFTED_HEADER_BAD_N_TENSORS = 40, - HANDCRAFTED_HEADER_BAD_N_KV = 50, - HANDCRAFTED_HEADER_EMPTY = 800, + HANDCRAFTED_HEADER_BAD_MAGIC = 10, + HANDCRAFTED_HEADER_BAD_VERSION_1 = 20, + HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30, + HANDCRAFTED_HEADER_BAD_N_TENSORS = 40, + HANDCRAFTED_HEADER_BAD_N_KV = 50, + HANDCRAFTED_HEADER_EMPTY = 800, - HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv, - HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv, - HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, - HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv, - HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv, + HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv, + HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv, + // HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, // removed because it can result in allocations > 1 TB (default sanitizer limit) + HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv, + HANDCRAFTED_KV_BAD_ALIGN = 50 + 
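The s.clear() added to llama_print_system_info() above fixes accumulation across calls; a minimal reproduction of the underlying pitfall:

```cpp
#include <cstdio>
#include <string>

// Minimal reproduction of the bug fixed above: a function-local static
// std::string keeps its contents across calls, so appending without a
// clear() makes each call return everything accumulated so far.
static const char * info() {
    static std::string s;
    s.clear(); // without this line, the second call would return "abc | abc | "
    s += "abc | ";
    return s.c_str();
}

int main() {
    printf("%s\n", info()); // abc |
    printf("%s\n", info()); // abc |  (still one copy, thanks to clear())
    return 0;
}
```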
offset_has_kv, + HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv, - HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors, - HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors, - HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors, - HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors, - HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors, - HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors, - HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors, - HANDCRAFTED_TENSORS_BAD_ALIGNMENT = 80 + offset_has_tensors, - HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors, - HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors, + HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors, + HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_ALIGN = 75 + offset_has_tensors, + HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN = 80 + offset_has_tensors, + HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors, + HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors, - HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data, - HANDCRAFTED_DATA_BAD_ALIGNMENT = 20 + offset_has_data, - HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data, - HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data, + HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data, + HANDCRAFTED_DATA_BAD_ALIGN = 15 + offset_has_data, + HANDCRAFTED_DATA_INCONSISTENT_ALIGN = 20 + offset_has_data, + HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data, + HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data, }; std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { switch (hft) { - case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC"; - case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1"; - case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE"; - case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV"; - case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS"; - case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY"; + case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC"; + case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1"; + case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE"; + case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV"; + case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS"; + case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY"; - case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE"; - case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE"; - case HANDCRAFTED_KV_BAD_VALUE_SIZE: return "KV_BAD_VALUE_SIZE"; - case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY"; - case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV"; + case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE"; + case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE"; + case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY"; + case HANDCRAFTED_KV_BAD_ALIGN: return "KV_BAD_ALIGN"; + case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV"; - case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE"; - case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS"; - case HANDCRAFTED_TENSORS_BAD_SHAPE: return
"TENSORS_BAD_SHAPE"; - case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG"; - case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE"; - case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET"; - case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME"; - case HANDCRAFTED_TENSORS_BAD_ALIGNMENT: return "TENSORS_BAD_ALIGNMENT"; - case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS"; - case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN"; + case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE"; + case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS"; + case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE"; + case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG"; + case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE"; + case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET"; + case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME"; + case HANDCRAFTED_TENSORS_BAD_ALIGN: return "TENSORS_BAD_ALIGN"; + case HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN: return "TENSORS_INCONSISTENT_ALIGN"; + case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS"; + case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN"; - case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA"; - case HANDCRAFTED_DATA_BAD_ALIGNMENT: return "DATA_BAD_ALIGNMENT"; - case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS"; - case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN"; + case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA"; + case HANDCRAFTED_DATA_BAD_ALIGN: return "DATA_BAD_ALIGN"; + case HANDCRAFTED_DATA_INCONSISTENT_ALIGN: return "DATA_INCONSISTENT_ALIGN"; + case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS"; + case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN"; } GGML_ABORT("fatal error"); } @@ -140,31 +145,41 @@ std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 return kv_types; } -static void helper_write(const void * data, const size_t nbytes, FILE * file) { +template <typename T> +static void helper_write(FILE * file, const T & val) { + GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val)); +} + +static void helper_write(FILE * file, const void * data, const size_t nbytes) { GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes); } static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) { FILE * file = tmpfile(); + if (!file) { + return file; + } + std::mt19937 rng(seed); + uint32_t alignment = GGUF_DEFAULT_ALIGNMENT; if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) { const char bad_magic[4] = {'F', 'U', 'G', 'G'}; - helper_write(bad_magic, sizeof(bad_magic), file); + helper_write(file, bad_magic, sizeof(bad_magic)); } else { - helper_write(GGUF_MAGIC, 4, file); + helper_write(file, GGUF_MAGIC, 4); } if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) { const uint32_t version = 1; - helper_write(&version, sizeof(version), file); + helper_write(file, version); } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) { const uint32_t version = GGUF_VERSION + 1; - helper_write(&version, sizeof(version), file); + helper_write(file, version); } else { const uint32_t version = GGUF_VERSION; - helper_write(&version, sizeof(version), file); + helper_write(file, version); } std::vector<tensor_config_t> tensor_configs; @@ -174,10 +189,10 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) { const uint64_t n_tensors = -1; -
helper_write(&n_tensors, sizeof(n_tensors), file); + helper_write(file, n_tensors); } else { const uint64_t n_tensors = tensor_configs.size(); - helper_write(&n_tensors, sizeof(n_tensors), file); + helper_write(file, n_tensors); } std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types; @@ -186,41 +201,49 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } { uint64_t n_kv = kv_types.size(); - if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + if (hft == HANDCRAFTED_KV_BAD_ALIGN || + hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || + hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + n_kv += 1; } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) { n_kv = -1; } - helper_write(&n_kv, sizeof(n_kv), file); + helper_write(file, n_kv); } if (hft < offset_has_kv) { + while (ftell(file) % alignment != 0) { + const char pad = 0; + helper_write(file, pad); + } + for (int i = 0; i < extra_bytes; ++i) { const char tmp = 0; - helper_write(&tmp, sizeof(tmp), file); + helper_write(file, tmp); } rewind(file); return file; } for (int i = 0; i < int(kv_types.size()); ++i) { - const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first); - const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second); + const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].first); + const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].second); const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i)); if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) { const uint64_t n = -1; - helper_write(&n, sizeof(n), file); + helper_write(file, n); } else { const uint64_t n = key.length(); - helper_write(&n, sizeof(n), file); + helper_write(file, n); } - helper_write(key.data(), key.length(), file); + helper_write(file, key.data(), key.length()); { const int32_t type32 = int32_t(type); - helper_write(&type32, sizeof(type32), file); + helper_write(file, type32); } uint32_t data[16]; @@ -233,69 +256,67 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft if (type == GGUF_TYPE_STRING) { const uint64_t n = rng() % sizeof(data); - helper_write(&n, sizeof(n), file); - helper_write(data, n, file); + helper_write(file, n); + helper_write(file, data, n); continue; } if (type == GGUF_TYPE_ARRAY) { { const int32_t type32 = int32_t(type_arr); - helper_write(&type32, sizeof(type32), file); + helper_write(file, type32); } if (type_arr == GGUF_TYPE_STRING) { const uint64_t nstr = rng() % (16 + 1); - helper_write(&nstr, sizeof(nstr), file); + helper_write(file, nstr); for (uint64_t istr = 0; istr < nstr; ++istr) { const uint64_t n = rng() % (sizeof(uint32_t) + 1); - helper_write(&n, sizeof(n), file); - helper_write(&data[istr], n, file); + helper_write(file, n); + helper_write(file, &data[istr], n); } continue; } const size_t type_size = gguf_type_size(type_arr); const uint64_t n = (rng() % sizeof(data)) / type_size; - helper_write(&n, sizeof(n), file); - helper_write(&data, n*type_size, file); + helper_write(file, n); + helper_write(file, &data, n*type_size); continue; } - size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type); - if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) { - type_size += rng() % 3; - } - helper_write(data, type_size, file); + helper_write(file, data, hft == HANDCRAFTED_KV_BAD_TYPE ?
1 : gguf_type_size(type)); } - if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { - const std::string key = "general.alignment"; - { - const uint64_t n = key.length(); - helper_write(&n, sizeof(n), file); - } - helper_write(key.data(), key.length(), file); + if (hft == HANDCRAFTED_KV_BAD_ALIGN || + hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || + hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + + const uint64_t n = strlen(GGUF_KEY_GENERAL_ALIGNMENT); + helper_write(file, n); + helper_write(file, GGUF_KEY_GENERAL_ALIGNMENT, n); const int32_t type = gguf_type(GGUF_TYPE_UINT32); - helper_write(&type, sizeof(type), file); + helper_write(file, type); - const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1; - helper_write(&alignment, sizeof(alignment), file); + alignment = expect_context_not_null(hft) ? 1 : 13; + helper_write(file, alignment); } if (hft < offset_has_tensors) { + while (ftell(file) % alignment != 0) { + const char pad = 0; + helper_write(file, pad); + } + for (int i = 0; i < extra_bytes; ++i) { const char tmp = 0; - helper_write(&tmp, sizeof(tmp), file); + helper_write(file, tmp); } rewind(file); return file; } - uint32_t alignment = GGUF_DEFAULT_ALIGNMENT; - if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) { - alignment -= 1; - } else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { - alignment += 1; + if (hft == HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN || hft == HANDCRAFTED_DATA_INCONSISTENT_ALIGN) { + alignment = 1; } uint64_t offset = 0; @@ -313,9 +334,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } { const uint64_t n = name.length(); - helper_write(&n, sizeof(n), file); + helper_write(file, n); } - helper_write(name.data(), name.length(), file); + helper_write(file, name.data(), name.length()); uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1; for (int i = GGML_MAX_DIMS-1; i >= 1; --i) { @@ -326,35 +347,35 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) { const uint32_t n_dims_bad = GGML_MAX_DIMS + 1; - helper_write(&n_dims_bad, sizeof(n_dims_bad), file); + helper_write(file, n_dims_bad); } else { - helper_write(&n_dims, sizeof(n_dims), file); + helper_write(file, n_dims); } if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) { for (uint32_t j = 0; j < n_dims; ++j) { const int64_t bad_dim = -1; - helper_write(&bad_dim, sizeof(bad_dim), file); + helper_write(file, bad_dim); } } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){ for (uint32_t j = 0; j < n_dims; ++j) { const int64_t big_dim = 4*int64_t(INT32_MAX); - helper_write(&big_dim, sizeof(big_dim), file); + helper_write(file, big_dim); } } else { - helper_write(shape.data(), n_dims*sizeof(int64_t), file); + helper_write(file, shape.data(), n_dims*sizeof(int64_t)); } { - const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type); - helper_write(&type32, sizeof(type32), file); + const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? 
GGML_TYPE_COUNT : int32_t(type); + helper_write(file, type32); } if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) { const uint64_t bad_offset = -1; - helper_write(&bad_offset, sizeof(bad_offset), file); + helper_write(file, bad_offset); } else { - helper_write(&offset, sizeof(offset), file); + helper_write(file, offset); } int64_t ne = shape[0]; @@ -364,12 +385,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft offset += GGML_PAD(ggml_row_size(type, ne), alignment); } - const uint32_t alignment_overshoot = ftell(file) % alignment; - if (alignment_overshoot != 0) { - for (size_t i = alignment_overshoot; i < alignment; ++i) { - const char pad = 0; - helper_write(&pad, sizeof(pad), file); - } + while (ftell(file) % alignment != 0) { + const char pad = 0; + helper_write(file, pad); } if (hft >= offset_has_data) { @@ -380,13 +398,13 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } for (uint64_t i = 0; i < nbytes; ++i) { const uint8_t random_byte = i % 256; - helper_write(&random_byte, sizeof(random_byte), file); + helper_write(file, random_byte); } } for (int i = 0; i < extra_bytes; ++i) { const char tmp = 0; - helper_write(&tmp, sizeof(tmp), file); + helper_write(file, tmp); } rewind(file); return file; @@ -505,6 +523,16 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i } const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id)); + + if (type_arr == GGUF_TYPE_BOOL) { + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { + if (bool(data8[arr_i]) != bool(data_gguf[arr_i])) { + ok = false; + } + } + continue; + } + if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) { ok = false; } @@ -512,12 +540,20 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i } const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id)); + + if (type == GGUF_TYPE_BOOL) { + if (bool(*data8) != bool(*data_gguf)) { + ok = false; + } + continue; + } + if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) { ok = false; } } - const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT; + const uint32_t expected_alignment = alignment_defined ? 1 : GGUF_DEFAULT_ALIGNMENT; if (gguf_get_alignment(gguf_ctx) != expected_alignment) { ok = false; } @@ -539,7 +575,7 @@ static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsig bool ok = true; - const int id_alignment = gguf_find_key(gguf_ctx, "general.alignment"); + const int id_alignment = gguf_find_key(gguf_ctx, GGUF_KEY_GENERAL_ALIGNMENT); const uint32_t alignment = id_alignment >= 0 ?
gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT; uint64_t expected_offset = 0; @@ -607,7 +643,7 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u std::vector<uint8_t> data(size); GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0); - GGML_ASSERT(fread(data.data(), 1, size, file) == size); + GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size()); for (size_t j = 0; j < size; ++j) { const uint8_t expected_byte = (j + offset) % 256; @@ -627,15 +663,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { const std::vector<handcrafted_file_type> hfts = { HANDCRAFTED_HEADER_BAD_MAGIC, HANDCRAFTED_HEADER_BAD_VERSION_1, - // HANDCRAFTED_FILE_TYPE_BAD_VERSION_FUTURE, // FIXME + HANDCRAFTED_HEADER_BAD_VERSION_FUTURE, HANDCRAFTED_HEADER_BAD_N_KV, HANDCRAFTED_HEADER_BAD_N_TENSORS, HANDCRAFTED_HEADER_EMPTY, HANDCRAFTED_KV_BAD_KEY_SIZE, HANDCRAFTED_KV_BAD_TYPE, - // HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit - // HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME + HANDCRAFTED_KV_DUPLICATE_KEY, + HANDCRAFTED_KV_BAD_ALIGN, HANDCRAFTED_KV_SUCCESS, HANDCRAFTED_TENSORS_BAD_NAME_SIZE, @@ -643,14 +679,16 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { HANDCRAFTED_TENSORS_BAD_SHAPE, HANDCRAFTED_TENSORS_NE_TOO_BIG, HANDCRAFTED_TENSORS_BAD_TYPE, - // HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME + HANDCRAFTED_TENSORS_BAD_OFFSET, HANDCRAFTED_TENSORS_DUPLICATE_NAME, - // HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME + HANDCRAFTED_TENSORS_BAD_ALIGN, + HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN, HANDCRAFTED_TENSORS_SUCCESS, HANDCRAFTED_TENSORS_CUSTOM_ALIGN, HANDCRAFTED_DATA_NOT_ENOUGH_DATA, - // HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME + HANDCRAFTED_DATA_BAD_ALIGN, + HANDCRAFTED_DATA_INCONSISTENT_ALIGN, HANDCRAFTED_DATA_SUCCESS, HANDCRAFTED_DATA_CUSTOM_ALIGN, }; @@ -674,6 +712,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { /*no_alloc =*/ false, /*ctx =*/ hft >= offset_has_data ?
&ctx : nullptr, }; + struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); if (expect_context_not_null(hft)) { @@ -689,7 +728,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { } ntest++; - if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME + if (hft >= offset_has_data && !expect_context_not_null(hft)) { printf("%s: - no_dangling_ggml_context_pointer: ", __func__); if (ctx) { printf("\033[1;31mFAIL\033[0m\n"); @@ -700,23 +739,6 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { } ntest++; } - if (false && expect_context_not_null(hft)) { // FIXME - FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1); - struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params); - - printf("%s: - context_null_with_extra_bytes: ", __func__); - if (gguf_ctx_eb) { - printf("\033[1;31mFAIL\033[0m\n"); - } else { - printf("\033[1;32mOK\033[0m\n"); - npass++; - } - ntest++; - - gguf_free(gguf_ctx_eb); - fclose(file_eb); - } - const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN; if (expect_context_not_null(hft)) { @@ -763,14 +785,15 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) { ntest++; } + fclose(file); if (gguf_ctx) { ggml_free(ctx); gguf_free(gguf_ctx); } - fclose(file); printf("\n"); } + return std::make_pair(npass, ntest); } @@ -789,10 +812,6 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t const std::string key = "my_key_" + std::to_string(rng() % 1024); const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT); - if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) { - continue; // FIXME memory leak - } - switch (type) { case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break; case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break; @@ -826,6 +845,9 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t)); for (size_t j = 0; j < random_data.size(); ++j) { random_data[j] = rng(); + if (type_arr == GGUF_TYPE_BOOL) { + random_data[j] &= 0x01010101; // the sanitizer complains if booleans are not 0 or 1 + } } gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne); } break; @@ -928,6 +950,17 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other continue; } + if (type_arr == GGUF_TYPE_BOOL) { + const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id)); + const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other)); + for (int arr_i = 0; arr_i < arr_n; ++arr_i) { + if (bool(data[arr_i]) != bool(data_other[arr_i])) { + ok = false; + } + } + continue; + } + if (type_arr == GGUF_TYPE_STRING) { for (int arr_i = 0; arr_i < arr_n; ++arr_i) { const std::string str = gguf_get_arr_str(ctx, id, arr_i); @@ -939,8 +972,8 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other continue; } - const char * data = reinterpret_cast<const char *>(gguf_get_arr_data(ctx, id)); - const char * data_other = reinterpret_cast<const char *>(gguf_get_arr_data(other, idx_other)); + const int8_t * data = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id)); + const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other)); if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) { ok = false; } @@ -1028,21 +1061,6 @@
static bool same_tensor_data(const struct ggml_context * orig, const struct ggml } static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { - FILE * file = tmpfile(); -#ifdef _WIN32 - if (!file) { - printf("%s: failed to create tmpfile(), needs elevated privileges on Windows"); - printf("%s: skipping tests"); - return std::make_pair(0, 0); - } -#else - GGML_ASSERT(file); -#endif // _WIN32 - - if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { - return std::make_pair(0, 0); // FIXME - } - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); printf("%s: device=%s, backend=%s, only_meta=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no"); @@ -1060,10 +1078,24 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned bbuf = result.buffer; } - struct gguf_buf gbuf = gguf_buf_init(16 * 1024); - gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta); - helper_write(gbuf.data, gbuf.offset, file); - rewind(file); + FILE * file = tmpfile(); + +#ifdef _WIN32 + if (!file) { + printf("%s: failed to create tmpfile(), needs elevated privileges on Windows\n", __func__); + printf("%s: skipping tests\n", __func__); + return std::make_pair(0, 0); + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + + { + std::vector<int8_t> buf; + gguf_write_to_buf(gguf_ctx_0, buf, only_meta); + GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size()); + rewind(file); + } struct ggml_context * ctx_1 = nullptr; struct gguf_init_params gguf_params = { @@ -1151,9 +1183,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned ggml_free(ctx_1); gguf_free(gguf_ctx_0); gguf_free(gguf_ctx_1); - gguf_buf_free(gbuf); ggml_backend_free(backend); - GGML_ASSERT(fclose(file) == 0); + fclose(file); printf("\n"); return std::make_pair(npass, ntest); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 858535c3c..9095826fa 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[] ) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file(model_path, params); + auto * model = llama_model_load_from_file(model_path, params); llama_backend_free(); return model == nullptr ?
EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 0af85f002..121c2c60c 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -152,7 +152,7 @@ int main(int argc, char **argv) { mparams.vocab_only = true; - model = llama_load_model_from_file(fname.c_str(), mparams); + model = llama_model_load_from_file(fname.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); @@ -165,7 +165,7 @@ int main(int argc, char **argv) { if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); + llama_model_free(model); return 1; } } @@ -300,7 +300,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); } - llama_free_model(model); + llama_model_free(model); llama_free(ctx); llama_backend_free(); diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 0ff7fc833..5718fab04 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -46,7 +46,7 @@ int main(int argc, char **argv) { mparams.vocab_only = true; - model = llama_load_model_from_file(fname.c_str(), mparams); + model = llama_model_load_from_file(fname.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); @@ -59,7 +59,7 @@ int main(int argc, char **argv) { if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); + llama_model_free(model); return 1; } } @@ -143,7 +143,7 @@ int main(int argc, char **argv) { } } - llama_free_model(model); + llama_model_free(model); llama_free(ctx); llama_backend_free(); diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index 9b0716a43..ac05387c9 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -34,7 +34,7 @@ int main(int argc, char ** argv) { mparams.vocab_only = true; - model = llama_load_model_from_file(fname.c_str(), mparams); + model = llama_model_load_from_file(fname.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); @@ -47,7 +47,7 @@ int main(int argc, char ** argv) { if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); + llama_model_free(model); return 1; } } @@ -113,7 +113,7 @@ int main(int argc, char ** argv) { } } - llama_free_model(model); + llama_model_free(model); llama_free(ctx); llama_backend_free();
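
Note on the llama_print_system_info() change in src/llama.cpp above: a function-local static std::string persists across calls, so appending to it without clearing first duplicates the report on every invocation. A minimal standalone sketch of the failure mode (hypothetical names, not llama.cpp code):

#include <cstdio>
#include <string>

// Reduced illustration: without s.clear(), every call appends another
// copy of the info to the same static string.
static const char * system_info() {
    static std::string s;
    s.clear(); // drop this line and the second call returns the text twice
    s += "AVX = 1 | FMA = 1";
    return s.c_str();
}

int main() {
    printf("%s\n", system_info()); // AVX = 1 | FMA = 1
    printf("%s\n", system_info()); // identical output thanks to clear()
    return 0;
}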
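The k_ucat_map entries added in src/unicode.cpp encode each category's sub-128 fallback as escaped bytes. As a quick sanity check, the bytes listed for \p{S} decode to exactly the characters named in the trailing comment:

#include <cstdio>

// Decodes the ASCII fallback class added for \p{S}:
// \x24, \x2B, \x3C-\x3E, \x5E, \x60, \x7C are $ + < = > ^ ` |
int main() {
    const unsigned char symbols[] = {0x24, 0x2B, 0x3C, 0x3D, 0x3E, 0x5E, 0x60, 0x7C};
    for (unsigned char c : symbols) {
        printf("0x%02X = '%c'\n", c, c);
    }
    return 0;
}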
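The helper_write() refactor in tests/test-gguf.cpp swaps the argument order and adds a templated overload, so single values are written as helper_write(file, val) with the byte count inferred from the type instead of repeating (&val, sizeof(val)) at every call site. A self-contained sketch of the same pattern, with a plain check standing in for GGML_ASSERT:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Templated overload: the byte count of a single value is sizeof(T).
template <typename T>
static void helper_write(FILE * file, const T & val) {
    if (fwrite(&val, 1, sizeof(val), file) != sizeof(val)) {
        abort();
    }
}

// Raw-buffer overload kept for writes of arbitrary length.
static void helper_write(FILE * file, const void * data, const size_t nbytes) {
    if (fwrite(data, 1, nbytes, file) != nbytes) {
        abort();
    }
}

int main() {
    FILE * file = tmpfile();
    if (!file) {
        return 1;
    }
    const uint32_t version = 3;
    helper_write(file, version);              // size inferred: sizeof(uint32_t)
    const char magic[4] = {'G', 'G', 'U', 'F'};
    helper_write(file, magic, sizeof(magic)); // explicit byte count
    fclose(file);
    return 0;
}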
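The new *_BAD_ALIGN cases in test-gguf.cpp write general.alignment = 13 and expect loading to fail, while the *_CUSTOM_ALIGN cases write 1 and expect success. That is consistent with the loader requiring a nonzero power of two; a sketch of such a check (inferred from the test values, not quoted from gguf.cpp):

#include <cstdint>
#include <cstdio>

// A nonzero x is a power of two iff clearing its lowest set bit,
// x & (x - 1), leaves zero.
static bool alignment_is_valid(uint32_t x) {
    return x != 0 && (x & (x - 1)) == 0;
}

int main() {
    printf("1  -> %d\n", alignment_is_valid(1));  // valid: the CUSTOM_ALIGN value
    printf("13 -> %d\n", alignment_is_valid(13)); // invalid: the BAD_ALIGN value
    printf("32 -> %d\n", alignment_is_valid(32)); // valid: a typical default
    return 0;
}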
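The GGUF_TYPE_BOOL special cases added above exist because GGUF stores a bool as one byte, and any nonzero byte counts as true. Comparing the raw bytes with std::equal can therefore flag two logically equal values as different, and sanitizers reject loading a byte other than 0 or 1 through a bool, which is also why the random test data is masked with 0x01010101 (each byte forced to 0 or 1). A reduced illustration:

#include <cstdint>
#include <cstdio>

int main() {
    const int8_t a = 1; // "true" encoded as 1
    const int8_t b = 2; // also "true", encoded as a different byte

    printf("bytes equal:  %d\n", a == b);             // 0: raw bytes differ
    printf("values equal: %d\n", bool(a) == bool(b)); // 1: same boolean value
    return 0;
}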
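The test updates above track the rename of llama_load_model_from_file/llama_free_model to llama_model_load_from_file/llama_model_free. A minimal load/free skeleton against the renamed API (a sketch with error handling trimmed to the essentials):

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams); // was llama_load_model_from_file
    if (model != nullptr) {
        llama_model_free(model); // was llama_free_model
    }

    llama_backend_free();
    return model == nullptr ? 1 : 0;
}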