style fixes

parent 3a1895d786
commit 961e98f245

1 changed file with 24 additions and 39 deletions

llama.cpp | 57
@@ -8175,7 +8175,9 @@ struct llm_tokenizer_wpm {
         // find the longest tokens that form the words
         for (const std::string &word : words) {
             // skip empty words
-            if (word.size() == 0) continue;
+            if (word.size() == 0) {
+                continue;
+            }
 
             // prepend phantom space
             std::string word1 = "\xe2\x96\x81" + word;
@@ -8201,7 +8203,9 @@ struct llm_tokenizer_wpm {
                 }
 
                 // must be an unknown character
-                if (!match) i++;
+                if (!match) {
+                    i++;
+                }
             }
 
             // we didn't find any matches for this word
@@ -8215,8 +8219,7 @@ struct llm_tokenizer_wpm {
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-        std::string ori_str = text;
-        ori_str = normalize(ori_str);
+        std::string ori_str = normalize(text);
         uint64_t ori_size = ori_str.size();
 
         // single punct / single symbol / single digit
@@ -8267,8 +8270,7 @@ struct llm_tokenizer_wpm {
     std::string normalize(const std::string &text) {
         // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
         std::string text2 = strip_accents(text);
-        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i]))
-        {
+        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
             char c = text2[i];
             if (c >= 'A' && c <= 'Z')
                 text2[i] = c - 'A' + 'a';
@@ -8334,17 +8336,13 @@ struct llm_tokenizer_wpm {
             {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
         };
 
-        for (size_t i = 0; i < inputString.length();)
-        {
+        for (size_t i = 0; i < inputString.length();) {
             int len = utf8_len(inputString[i]);
             std::string curChar = inputString.substr(i, len);
             auto iter = accentMap.find(curChar);
-            if (iter != accentMap.end())
-            {
+            if (iter != accentMap.end()) {
                 resultString += iter->second;
-            }
-            else
-            {
+            } else {
                 resultString += curChar;
             }
             i += len;
@@ -8397,8 +8395,7 @@ struct fragment_buffer_variant{
 
 // #define PRETOKENIZERDEBUG
 
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
-{
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const auto & st: vocab.special_tokens_cache) {
         const auto & special_token = st.first;
@@ -8516,10 +8513,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
 
                         // TODO: It's likely possible to get rid of this string copy entirely
@@ -8539,19 +8534,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
@@ -8559,19 +8550,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
@@ -8579,9 +8566,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                         llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
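Every hunk above applies the same two conventions: opening braces stay on the same line as the statement, and even single-statement if/for bodies get braces. A minimal self-contained sketch of that style (hypothetical example, not code from llama.cpp):

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical example: same-line opening braces, and braces around
// single-statement bodies, mirroring the style this commit applies.
static size_t count_nonempty(const std::vector<std::string> & words) {
    size_t n = 0;
    for (const std::string & word : words) {
        if (word.empty()) {
            continue;
        }
        n++;
    }
    return n;
}

int main() {
    const std::vector<std::string> words = {"a", "", "bc"};
    printf("%zu\n", count_nonempty(words)); // prints 2
    return 0;
}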