tests : add fail test for llama-bpe

This commit is contained in:
Georgi Gerganov 2024-05-09 10:27:14 +03:00
parent 8de8b6d1df
commit 12a7b69623
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 8 additions and 7 deletions

View file

@ -257,6 +257,7 @@ tests = [
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]

View file

@ -112,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
std::unordered_map<uint32_t, int> cpt_types;
for (auto p : unicode_ranges_number) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
}
}
for (auto p : unicode_ranges_letter) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_LETTER;
}
}
for (auto p : unicode_ranges_separator) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
}
}
for (auto p : unicode_ranges_accent_mark) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
}
for (auto p : unicode_ranges_punctuation) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
}
@ -142,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
}
}
for (auto p : unicode_ranges_control) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
}
}
@ -629,7 +629,7 @@ bool unicode_cpt_is_whitespace(uint32_t cp) {
static const std::unordered_set<uint32_t> is_whitespace = [] {
std::unordered_set<uint32_t> is_whitespace;
for (auto p : unicode_ranges_whitespace) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
is_whitespace.insert(i);
}
}