From 9654d62f7e47ab8559b3039f0633a6ae00f1e01a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Mar 2024 11:41:29 +0200 Subject: [PATCH] unicode : names --- llama.cpp | 10 +++++----- unicode.cpp | 12 ++++++------ unicode.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index c712a42a5..76f44aa45 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9340,7 +9340,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } case LLAMA_VOCAB_TYPE_BPE: { GGML_ASSERT(false); - return unicode_utf8_to_bytes(token_data.text); + return unicode_utf8_to_byte(token_data.text); } case LLAMA_VOCAB_TYPE_WPM: { GGML_ASSERT(false); @@ -9365,7 +9365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { - return vocab.token_to_id.at(unicode_bytes_to_utf8(ch)); + return vocab.token_to_id.at(unicode_byte_to_utf8(ch)); } default: GGML_ASSERT(false); @@ -9819,7 +9819,7 @@ private: for (std::string & word : bpe_words) { std::string encoded_token = ""; for (char & c : word) { - encoded_token += unicode_bytes_to_utf8(c); + encoded_token += unicode_byte_to_utf8(c); } bpe_encoded_words.emplace_back(encoded_token); } @@ -13955,8 +13955,8 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto& unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_bytes(unicode_cpt_to_utf8(unicode_sequence)); + for (auto & unicode_sequence : unicode_sequences) { + decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); } return decoded_text; diff --git a/unicode.cpp b/unicode.cpp index 3b6462b19..0e4a292d8 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -666,7 +666,7 @@ static std::unordered_map cpt_type_map() { return cpt_types; } -static std::unordered_map bytes_to_unicode_map() { 
+static std::unordered_map unicode_byte_to_utf8_map() { std::unordered_map map; for (int ch = u'!'; ch <= u'~'; ++ch) { assert(0 <= ch && ch < 256); @@ -690,7 +690,7 @@ static std::unordered_map bytes_to_unicode_map() { return map; } -static std::unordered_map unicode_to_bytes_map() { +static std::unordered_map unicode_utf8_to_byte_map() { std::unordered_map map; for (int ch = u'!'; ch <= u'~'; ++ch) { assert(0 <= ch && ch < 256); @@ -771,13 +771,13 @@ int unicode_cpt_type(const std::string & utf8) { return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset)); } -std::string unicode_bytes_to_utf8(uint8_t byte) { - static std::unordered_map map = bytes_to_unicode_map(); +std::string unicode_byte_to_utf8(uint8_t byte) { + static std::unordered_map map = unicode_byte_to_utf8_map(); return map.at(byte); } -uint8_t unicode_utf8_to_bytes(const std::string & utf8) { - static std::unordered_map map = unicode_to_bytes_map(); +uint8_t unicode_utf8_to_byte(const std::string & utf8) { + static std::unordered_map map = unicode_utf8_to_byte_map(); return map.at(utf8); } diff --git a/unicode.h b/unicode.h index 5636ea0f1..3c57f1502 100644 --- a/unicode.h +++ b/unicode.h @@ -22,6 +22,6 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8); int unicode_cpt_type(uint32_t cp); int unicode_cpt_type(const std::string & utf8); -std::string unicode_bytes_to_utf8(uint8_t byte); +std::string unicode_byte_to_utf8(uint8_t byte); +uint8_t unicode_utf8_to_byte(const std::string & utf8); -uint8_t unicode_utf8_to_bytes(const std::string & utf8);