diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7be609054..27ed2f1ee 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2329,9 +2329,8 @@ class InternLM2Model(Model): def set_vocab(self): # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. + # Copy from _set_vocab_sentencepiece, The only difference is that we find mislabeled UNUSED tokens, + # and that we set '<|im_end|>' as the eos token for chat models. from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2358,11 +2357,6 @@ class InternLM2Model(Model): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") toktype = SentencePieceTokenTypes.NORMAL if tokenizer.IsUnknown(token_id): diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index ecff95f9a..a6bfe6035 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -562,7 +562,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam vocab->id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); + std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i)); vocab->token_to_id[word] = i; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 0051a5eb6..79a9d0974 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -19,7 +19,7 @@ struct tensor_transformation { static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id)); } static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8aa7b0750..34a91edf0 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -240,7 +240,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { switch (type) { case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); + return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i)); case GGUF_TYPE_ARRAY: { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); @@ -250,7 +250,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { ss << "["; for (int j = 0; j < arr_n; j++) { if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); + std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j)); // escape quotes replace_all(val, "\\", "\\\\"); replace_all(val, "\"", "\\\""); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e24b8a319..a4e1fec0c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2410,10 +2410,12 @@ extern "C" { GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + GGML_API int gguf_get_val_str_n(const struct gguf_context * ctx, int key_id); GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id); GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + GGML_API int gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4b782b0c1..01ef74db1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -22946,6 +22946,14 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i return str->data; } +int gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i) { + GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->n; +} + int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY); @@ -23024,6 +23032,12 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) { return ctx->kv[key_id].value.str.data; } +int gguf_get_val_str_n(const struct gguf_context * ctx, int key_id) { + GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); + GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING); + return ctx->kv[key_id].value.str.n; +} + const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) { GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx)); GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY); diff --git a/src/llama.cpp b/src/llama.cpp index 0accb1492..a44aca294 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1609,7 +1609,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { switch (type) { case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); + return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i)); case GGUF_TYPE_ARRAY: { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); @@ -1619,7 +1619,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { ss << "["; for (int j = 0; j < arr_n; j++) { if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); + std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j)); // escape quotes replace_all(val, "\\", "\\\\"); replace_all(val, "\"", "\\\""); @@ -4205,7 +4205,7 @@ namespace GGUFMeta { static constexpr gguf_type gt = GGUF_TYPE_STRING; static std::string getter(const gguf_context * ctx, const int kid) { - return gguf_get_val_str(ctx, kid); + return std::string(gguf_get_val_str(ctx, kid), gguf_get_val_str_n(ctx, kid)); } }; @@ -6167,7 +6167,7 @@ static void llm_load_vocab( const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + const std::string word(gguf_get_arr_str(ctx, merges_keyidx, i), gguf_get_arr_str_n(ctx, merges_keyidx, i)); GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); std::string first; @@ -6397,7 +6397,7 @@ static void llm_load_vocab( vocab.id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); + std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i)); GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); vocab.token_to_id[word] = i; @@ -18369,7 +18369,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c { auto get_kv_str = [&](const std::string & key) -> std::string { int id = gguf_find_key(ctx_gguf, key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id)); }; auto get_kv_f32 = [&](const std::string & key) -> float { int id = gguf_find_key(ctx_gguf, key.c_str());