fix codestyle

2024-07-01 02:23:19 +00:00 · 2024-07-01 02:23:19 +00:00 · d07f0a90c3
commit d07f0a90c3
parent bbe1926fac
3 changed files with 30 additions and 31 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -87,35 +87,35 @@ class Keys:
        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

    class Tokenizer:
-        MODEL            = "tokenizer.ggml.model"
-        PRE              = "tokenizer.ggml.pre"
-        LIST             = "tokenizer.ggml.tokens"
-        TOKEN_TYPE       = "tokenizer.ggml.token_type"
-        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
-        SCORES           = "tokenizer.ggml.scores"
-        MERGES           = "tokenizer.ggml.merges"
-        BOS_ID           = "tokenizer.ggml.bos_token_id"
-        EOS_ID           = "tokenizer.ggml.eos_token_id"
-        UNK_ID           = "tokenizer.ggml.unknown_token_id"
-        SEP_ID           = "tokenizer.ggml.seperator_token_id"
-        PAD_ID           = "tokenizer.ggml.padding_token_id"
-        CLS_ID           = "tokenizer.ggml.cls_token_id"
-        MASK_ID          = "tokenizer.ggml.mask_token_id"
-        ADD_BOS          = "tokenizer.ggml.add_bos_token"
-        ADD_EOS          = "tokenizer.ggml.add_eos_token"
-        ADD_PREFIX       = "tokenizer.ggml.add_space_prefix"
-        REMOVE_EXTRA_WS  = "tokenizer.ggml.remove_extra_whitespaces"
+        MODEL                = "tokenizer.ggml.model"
+        PRE                  = "tokenizer.ggml.pre"
+        LIST                 = "tokenizer.ggml.tokens"
+        TOKEN_TYPE           = "tokenizer.ggml.token_type"
+        TOKEN_TYPE_COUNT     = "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        SCORES               = "tokenizer.ggml.scores"
+        MERGES               = "tokenizer.ggml.merges"
+        BOS_ID               = "tokenizer.ggml.bos_token_id"
+        EOS_ID               = "tokenizer.ggml.eos_token_id"
+        UNK_ID               = "tokenizer.ggml.unknown_token_id"
+        SEP_ID               = "tokenizer.ggml.seperator_token_id"
+        PAD_ID               = "tokenizer.ggml.padding_token_id"
+        CLS_ID               = "tokenizer.ggml.cls_token_id"
+        MASK_ID              = "tokenizer.ggml.mask_token_id"
+        ADD_BOS              = "tokenizer.ggml.add_bos_token"
+        ADD_EOS              = "tokenizer.ggml.add_eos_token"
+        ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
-        HF_JSON          = "tokenizer.huggingface.json"
-        RWKV             = "tokenizer.rwkv.world"
-        CHAT_TEMPLATE    = "tokenizer.chat_template"
-        CHAT_TEMPLATE_N  = "tokenizer.chat_template.{name}"
-        CHAT_TEMPLATES   = "tokenizer.chat_templates"
+        HF_JSON              = "tokenizer.huggingface.json"
+        RWKV                 = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE        = "tokenizer.chat_template"
+        CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
+        CHAT_TEMPLATES       = "tokenizer.chat_templates"
        # FIM/Infill special tokens constants
-        PREFIX_ID        = "tokenizer.ggml.prefix_token_id"
-        SUFFIX_ID        = "tokenizer.ggml.suffix_token_id"
-        MIDDLE_ID        = "tokenizer.ggml.middle_token_id"
-        EOT_ID           = "tokenizer.ggml.eot_token_id"
+        PREFIX_ID            = "tokenizer.ggml.prefix_token_id"
+        SUFFIX_ID            = "tokenizer.ggml.suffix_token_id"
+        MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
+        EOT_ID               = "tokenizer.ggml.eot_token_id"

 #
 # recommended mapping of model tensor names for storage in gguf
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5222,7 +5222,6 @@ static void llm_load_vocab(
                vocab.special_eot_id    = 107;
            }
        }
-        
        try {
            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
        } catch (const std::exception & e) {
@ -19829,7 +19828,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
-    } else if (tmpl.find("chatglm3") != std::string::npos || tmpl.find("[gMASK]sop") != std::string::npos) {
+    } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
        // chatglm3-6b
        ss << "[gMASK]" << "sop";
        for (auto message : chat) {
@ -19839,7 +19838,7 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>";
        }
-    } else if (tmpl.find("chatglm4") != std::string::npos || tmpl.find("[gMASK]<sop>") != std::string::npos) {
+    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -61,7 +61,7 @@ int main(void) {
        // ChatGLM3
        "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
        // ChatGLM4
-        "chatglm4",
+        u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
    };
    std::vector<std::string> expected_output = {
        // teknium/OpenHermes-2.5-Mistral-7B