unicode : names
This commit is contained in:
parent
0458996ec1
commit
9654d62f7e
3 changed files with 13 additions and 13 deletions
10
llama.cpp
10
llama.cpp
|
@ -9340,7 +9340,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_BPE: {
|
case LLAMA_VOCAB_TYPE_BPE: {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
return unicode_utf8_to_bytes(token_data.text);
|
return unicode_utf8_to_byte(token_data.text);
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_WPM: {
|
case LLAMA_VOCAB_TYPE_WPM: {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
@ -9365,7 +9365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||||
}
|
}
|
||||||
case LLAMA_VOCAB_TYPE_WPM:
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
case LLAMA_VOCAB_TYPE_BPE: {
|
case LLAMA_VOCAB_TYPE_BPE: {
|
||||||
return vocab.token_to_id.at(unicode_bytes_to_utf8(ch));
|
return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
@ -9819,7 +9819,7 @@ private:
|
||||||
for (std::string & word : bpe_words) {
|
for (std::string & word : bpe_words) {
|
||||||
std::string encoded_token = "";
|
std::string encoded_token = "";
|
||||||
for (char & c : word) {
|
for (char & c : word) {
|
||||||
encoded_token += unicode_bytes_to_utf8(c);
|
encoded_token += unicode_byte_to_utf8(c);
|
||||||
}
|
}
|
||||||
bpe_encoded_words.emplace_back(encoded_token);
|
bpe_encoded_words.emplace_back(encoded_token);
|
||||||
}
|
}
|
||||||
|
@ -13955,8 +13955,8 @@ int32_t llama_tokenize(
|
||||||
static std::string llama_decode_text(const std::string & text) {
|
static std::string llama_decode_text(const std::string & text) {
|
||||||
std::string decoded_text;
|
std::string decoded_text;
|
||||||
auto unicode_sequences = unicode_cpts_from_utf8(text);
|
auto unicode_sequences = unicode_cpts_from_utf8(text);
|
||||||
for (auto& unicode_sequence : unicode_sequences) {
|
for (auto & unicode_sequence : unicode_sequences) {
|
||||||
decoded_text += unicode_utf8_to_bytes(unicode_cpt_to_utf8(unicode_sequence));
|
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
|
||||||
}
|
}
|
||||||
|
|
||||||
return decoded_text;
|
return decoded_text;
|
||||||
|
|
12
unicode.cpp
12
unicode.cpp
|
@ -666,7 +666,7 @@ static std::unordered_map<uint32_t, int> cpt_type_map() {
|
||||||
return cpt_types;
|
return cpt_types;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map() {
|
static std::unordered_map<uint8_t, std::string> unicode_byteo_to_utf8_map() {
|
||||||
std::unordered_map<uint8_t, std::string> map;
|
std::unordered_map<uint8_t, std::string> map;
|
||||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||||
assert(0 <= ch && ch < 256);
|
assert(0 <= ch && ch < 256);
|
||||||
|
@ -690,7 +690,7 @@ static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map() {
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map() {
|
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
||||||
std::unordered_map<std::string, uint8_t> map;
|
std::unordered_map<std::string, uint8_t> map;
|
||||||
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
for (int ch = u'!'; ch <= u'~'; ++ch) {
|
||||||
assert(0 <= ch && ch < 256);
|
assert(0 <= ch && ch < 256);
|
||||||
|
@ -771,13 +771,13 @@ int unicode_cpt_type(const std::string & utf8) {
|
||||||
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string unicode_bytes_to_utf8(uint8_t byte) {
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
||||||
static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map();
|
static std::unordered_map<uint8_t, std::string> map = unicode_byteo_to_utf8_map();
|
||||||
return map.at(byte);
|
return map.at(byte);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t unicode_utf8_to_bytes(const std::string & utf8) {
|
uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
||||||
static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map();
|
static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
|
||||||
return map.at(utf8);
|
return map.at(utf8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,6 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||||
int unicode_cpt_type(uint32_t cp);
|
int unicode_cpt_type(uint32_t cp);
|
||||||
int unicode_cpt_type(const std::string & utf8);
|
int unicode_cpt_type(const std::string & utf8);
|
||||||
|
|
||||||
std::string unicode_bytes_to_utf8(uint8_t byte);
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||||
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||||
|
|
||||||
uint8_t unicode_utf8_to_bytes(const std::string & utf8);
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue