From 9654d62f7e47ab8559b3039f0633a6ae00f1e01a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 11 Mar 2024 11:41:29 +0200
Subject: [PATCH] unicode : names

---
 llama.cpp   | 10 +++++-----
 unicode.cpp | 12 ++++++------
 unicode.h   |  4 ++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index c712a42a5..76f44aa45 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9340,7 +9340,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_bytes(token_data.text);
+            return unicode_utf8_to_byte(token_data.text);
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -9365,7 +9365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
-            return vocab.token_to_id.at(unicode_bytes_to_utf8(ch));
+            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
         }
         default:
             GGML_ASSERT(false);
@@ -9819,7 +9819,7 @@ private:
         for (std::string & word : bpe_words) {
             std::string encoded_token = "";
             for (char & c : word) {
-                encoded_token += unicode_bytes_to_utf8(c);
+                encoded_token += unicode_byte_to_utf8(c);
             }
             bpe_encoded_words.emplace_back(encoded_token);
         }
@@ -13955,8 +13955,8 @@ int32_t llama_tokenize(
 static std::string llama_decode_text(const std::string & text) {
     std::string decoded_text;
     auto unicode_sequences = unicode_cpts_from_utf8(text);
-    for (auto& unicode_sequence : unicode_sequences) {
-        decoded_text += unicode_utf8_to_bytes(unicode_cpt_to_utf8(unicode_sequence));
+    for (auto & unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
     }
 
     return decoded_text;
diff --git a/unicode.cpp b/unicode.cpp
index 3b6462b19..0e4a292d8 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -666,7 +666,7 @@ static std::unordered_map<uint32_t, int> cpt_type_map() {
     return cpt_types;
 }
 
-static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map() {
+static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
     std::unordered_map<uint8_t, std::string> map;
     for (int ch = u'!'; ch <= u'~'; ++ch) {
         assert(0 <= ch && ch < 256);
@@ -690,7 +690,7 @@ static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map() {
     return map;
 }
 
-static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map() {
+static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
     std::unordered_map<std::string, uint8_t> map;
     for (int ch = u'!'; ch <= u'~'; ++ch) {
         assert(0 <= ch && ch < 256);
@@ -771,13 +771,13 @@ int unicode_cpt_type(const std::string & utf8) {
     return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }
 
-std::string unicode_bytes_to_utf8(uint8_t byte) {
-    static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map();
+std::string unicode_byte_to_utf8(uint8_t byte) {
+    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
     return map.at(byte);
 }
 
-uint8_t unicode_utf8_to_bytes(const std::string & utf8) {
-    static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map();
+uint8_t unicode_utf8_to_byte(const std::string & utf8) {
+    static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
     return map.at(utf8);
 }
 
diff --git a/unicode.h b/unicode.h
index 5636ea0f1..3c57f1502 100644
--- a/unicode.h
+++ b/unicode.h
@@ -22,6 +22,6 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);
 
-std::string unicode_bytes_to_utf8(uint8_t byte);
+std::string unicode_byte_to_utf8(uint8_t byte);
+uint8_t unicode_utf8_to_byte(const std::string & utf8);
 
-uint8_t unicode_utf8_to_bytes(const std::string & utf8);