llama : add ftype meta info to the model
ggml-ci
commit bee1f0e441
parent ef3f333d37

3 changed files with 24 additions and 3 deletions

gguf.py (+4, -0)

@@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION = "general.description"
 KEY_GENERAL_LICENSE        = "general.license"
 KEY_GENERAL_SOURCE_URL     = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE      = "general.file_type"
 
 # LLM
 KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"

@@ -595,6 +596,9 @@ class GGUFWriter:
     def add_source_hf_repo(self, repo: str):
         self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
 
+    def add_file_type(self, ftype: int):
+        self.add_string(KEY_GENERAL_FILE_TYPE, ftype)
+
     def add_name(self, name: str):
         self.add_string(KEY_GENERAL_NAME, name)
 
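The Python writer records the new key through GGUFWriter; on the C side the same key is written with ggml's gguf API, as the quantize hunk in llama.cpp below does. A minimal standalone sketch of that write path, assuming the ggml.h gguf API of this era; the output name and the value 1 (LLAMA_FTYPE_MOSTLY_F16) are placeholders:

// sketch: write general.file_type into a GGUF file's metadata via ggml's C API
#include "ggml.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_u32(ctx, "general.file_type", 1);          // placeholder: LLAMA_FTYPE_MOSTLY_F16
    gguf_write_to_file(ctx, "out.gguf", /*only_meta =*/ true); // placeholder file name
    gguf_free(ctx);
    return 0;
}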

llama.cpp (+18, -3)

@@ -1121,6 +1121,16 @@ struct llama_model_loader {
             } break;
         }
 
+        // this is a way to mark that we have "guessed" the file type
+        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+        {
+            const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+            if (kid >= 0) {
+                ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+            }
+        }
+
         for (int i = 0; i < n_kv; i++) {
             const char * name = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

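The loader first flags the tensor-derived guess with LLAMA_FTYPE_GUESSED, then overwrites it when the file carries an explicit general.file_type. The same two lookup calls work standalone; a hedged sketch for inspecting an arbitrary file, assuming the gguf API of this vintage (the no_alloc init skips loading tensor data):

// sketch: check whether a GGUF file declares general.file_type
#include "ggml.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == NULL) { fprintf(stderr, "failed to read %s\n", argv[1]); return 1; }
    const int kid = gguf_find_key(ctx, "general.file_type");
    if (kid >= 0) {
        printf("general.file_type = %u\n", gguf_get_val_u32(ctx, kid));
    } else {
        printf("no general.file_type key - the loader will guess\n");
    }
    gguf_free(ctx);
    return 0;
}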
@@ -1323,7 +1333,11 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";

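Because LLAMA_FTYPE_GUESSED is a single bit above every real ftype value, the function can strip it, recurse once to get the base name, and append the suffix; the recursion terminates since the flag is cleared before the nested call. A self-contained analog of the pattern (the my_* names are illustrative, not llama.cpp's):

#include <cstdio>
#include <string>

// one high bit of the enum carries metadata ("guessed"); the name function
// strips it, recurses once for the base name, and appends a suffix
enum my_ftype {
    MY_FTYPE_F16     = 1,
    MY_FTYPE_GUESSED = 1024, // single bit, above all real values
};

static std::string my_ftype_name(enum my_ftype ftype) {
    if (ftype & MY_FTYPE_GUESSED) {
        return my_ftype_name((enum my_ftype) (ftype & ~MY_FTYPE_GUESSED)) + " (guessed)";
    }
    switch (ftype) {
        case MY_FTYPE_F16: return "mostly F16";
        default:           return "unknown";
    }
}

int main(void) {
    // prints "mostly F16 (guessed)"
    printf("%s\n", my_ftype_name((enum my_ftype) (MY_FTYPE_F16 | MY_FTYPE_GUESSED)).c_str());
    return 0;
}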
@@ -1552,7 +1566,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
 
     // general kv

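The added .c_str() here (and in llama_model_type below) follows from the signature change above: llama_model_ftype_name now returns std::string, and passing a non-trivial class object through a C variadic call such as LLAMA_LOG_INFO/snprintf is undefined behavior. A minimal illustration of the rule:

#include <cstdio>
#include <string>

int main(void) {
    std::string name = "mostly F16 (guessed)";
    // printf("%s\n", name);      // undefined behavior: std::string through varargs
    printf("%s\n", name.c_str()); // correct: pass the underlying C string
    return 0;
}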
@@ -3620,6 +3634,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;

@@ -4471,7 +4486,7 @@ int llama_model_n_embd(const struct llama_model * model) {
 }
 
 int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
 }
 
 int llama_model_quantize(


llama.h (+2, -0)

@@ -103,6 +103,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K   = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     typedef struct llama_token_data {

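LLAMA_FTYPE_GUESSED sits well above every real ftype value, so it composes with any of them without collisions. Through the public API the flag surfaces only in the printed description; a usage sketch against the llama.h of this commit, with a placeholder model path:

// sketch: load a model and print its description, which now includes the
// ftype name and a "(guessed)" marker when general.file_type was absent
#include "llama.h"
#include <cstdio>

int main(void) {
    llama_backend_init(false); // numa = false
    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", params); // placeholder path
    if (model != NULL) {
        char desc[128];
        llama_model_type(model, desc, sizeof(desc)); // e.g. "LLaMA 7B mostly F16 (guessed)"
        printf("%s\n", desc);
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}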