From 668e0d9b735ea68f8ea8f257397939f95a60a0d5 Mon Sep 17 00:00:00 2001 From: Joan Martinez Date: Wed, 8 May 2024 10:12:10 +0200 Subject: [PATCH] feat: remove extra complexity in NFD --- .../server/tests/features/embeddings.feature | 9 +++- llama.cpp | 3 +- unicode-data.cpp | 5 --- unicode-data.h | 1 - unicode.cpp | 44 +++---------------- unicode.h | 4 +- 6 files changed, 15 insertions(+), 51 deletions(-) diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index bbb386db5..d6b92a453 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -16,10 +16,17 @@ Feature: llama.cpp server Then the server is starting Then the server is healthy + Scenario: Embedding + When embeddings are computed for: + """ + What is the capital of Bulgaria ? + """ + Then embeddings are generated + Scenario: Tokenize / Detokenize complex When tokenizing: """ - España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国 + 北京的清晨,空氣清新而寧靜,一个年轻的旅行者在长城上漫步,他从自己的故乡—서울에서 출발하여 아시아의 다양한 문화를 탐험하고자 하는 꿈을 품고 떠났다。彼は日本の古都、京都を訪れ、そこで美しい桜の花が満開の下で古典音楽のコンサートに参加しました。祭りの夜、彼は色とりどりの灯籠が空に浮かぶのを見て、その美しさに感動しました。その後、彼は印度のバラナシに到着し、गंगा की घाटों पर आध्यात्मिक शांति की खोज में जुट गया। वहाँ उसने दिवाली के उत्सव में हिस्सा लिया, जहां लाखों दीये जलाकर समृद्धि और खुशहाली की कामना की गई थी।この旅は彼にとって非常に啓発的であり、多くの異なる文化から新しいことを学び、新しい友達を作る機会を与えました。彼はこの経験を通じて、 異なる文化の間の共通点と相違点を理解するようになりました。España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça 例子 東京 こんにちは 你好 中国 """ Then tokens can be detokenize and is equivalent False diff --git a/llama.cpp b/llama.cpp index 177b928e3..aeb5c08df 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12456,8 +12456,7 @@ struct llm_tokenizer_wpm { } std::vector preprocess(const std::string & text) { - auto unicode_cpts = unicode_cpts_from_utf8(text); - std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts); + std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); // strip accents, strip control, uniformize whitespace, // to lowercase, pad chinese characters, pad punctuation diff --git a/unicode-data.cpp b/unicode-data.cpp index b6f9e8ed1..07bf02c45 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -1691,8 +1691,3 @@ const std::map unicode_map_lowercase = { {0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E}, {0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943}, }; - - -const std::map unicode_canonical_class = { - {42613, 230}, {824, 1}, {811, 220}, {2292, 230}, {2030, 230}, {65065, 220}, {773, 230}, {784, 230}, {2281, 220}, {43233, 230}, {43244, 230}, {6964, 7}, {1434, 222}, {1752, 230}, {7646, 230}, {7657, 230}, {8401, 230}, {8412, 230}, {65057, 230}, {70197, 9}, {6841, 220}, {11764, 230}, {1847, 220}, {42654, 230}, {119143, 1}, {841, 220}, {43699, 230}, {68325, 230}, {1459, 13}, {7397, 1}, {92914, 1}, {7083, 9}, {1558, 230}, {69890, 230}, {119152, 216}, {7619, 230}, {1859, 230}, {1441, 230}, {1452, 230}, {1473, 24}, {42616, 230}, {2090, 230}, {2876, 7}, {792, 220}, {803, 220}, {795, 216}, {856, 232}, {1453, 222}, {42655, 230}, {1771, 230}, {1474, 25}, {122894, 230}, {776, 230}, {7626, 220}, {119212, 230}, {43204, 9}, {1559, 230}, {2303, 230}, {43309, 220}, {125253, 230}, {7638, 230}, {65060, 230}, {1469, 22}, {7387, 230}, {1627, 230}, {12330, 218}, {119174, 230}, {2265, 230}, {7019, 230}, {874, 230}, {6779, 230}, {7668, 230}, {122913, 230}, {3975, 230}, {11775, 230}, {1433, 230}, {2071, 230}, {2082, 230}, {2093, 230}, {6836, 230}, {43235, 230}, {43246, 230}, {836, 230}, {43766, 9}, {92980, 230}, {125136, 220}, {1458, 12}, {6680, 220}, {122886, 230}, {3963, 130}, {2765, 9}, {11748, 230}, {852, 220}, {2284, 230}, {1155, 230}, {2295, 230}, {2033, 230}, {65068, 220}, {787, 230}, {2139, 220}, {43236, 230}, {1619, 230}, {7629, 234}, {1850, 230}, {1861, 230}, {2381, 9}, {7022, 230}, {844, 230}, {855, 230}, {1755, 230}, {7649, 230}, {7660, 230}, {122916, 230}, {6459, 220}, {11756, 230}, {11767, 230}, {43443, 7}, {11505, 230}, {71104, 7}, {8431, 220}, {43014, 9}, {1425, 220}, {2492, 7}, {7622, 230}, {1455, 230}, {12334, 224}, {42619, 230}, {1613, 29}, {2276, 230}, {7142, 7}, {7154, 9}, {8423, 230}, {768, 230}, {779, 230}, {7417, 230}, {1858, 220}, {1763, 220}, {43710, 230}, {4237, 220}, {1842, 230}, {1444, 220}, {7641, 230}, {7652, 230}, {66045, 220}, {122897, 230}, {1630, 230}, {11759, 230}, {119177, 230}, {125138, 220}, {70090, 7}, {825, 220}, {125256, 230}, {1465, 19}, {1553, 230}, {1843, 230}, {1436, 230}, {43247, 230}, {70512, 230}, {3158, 91}, {113822, 1}, {2085, 230}, {2620, 7}, {12442, 8}, {2268, 230}, {3768, 118}, {65071, 230}, {7024, 230}, {1158, 230}, {43713, 230}, {7400, 1}, {1612, 28}, {66424, 230}, {7633, 230}, {1454, 228}, {12333, 222}, {806, 220}, {817, 220}, {828, 220}, {2260, 230}, {7025, 230}, {122908, 230}, {122919, 230}, {3970, 230}, {11770, 230}, {2287, 220}, {1428, 230}, {1439, 230}, {70504, 230}, {70515, 230}, {7679, 220}, {2077, 230}, {42737, 230}, {64286, 26}, {119362, 230}, {122881, 230}, {71103, 9}, {2385, 230}, {3893, 220}, {2279, 230}, {2028, 230}, {877, 230}, {7390, 220}, {119166, 220}, {794, 232}, {782, 230}, {771, 230}, {8426, 1}, {1845, 230}, {1856, 230}, {3787, 122}, {850, 230}, {1447, 220}, {7644, 230}, {4957, 230}, {122889, 230}, {122900, 230}, {6839, 220}, {11751, 230}, {11762, 230}, {11773, 230}, {2288, 27}, {1556, 230}, {69888, 230}, {92978, 230}, {2298, 220}, {4154, 9}, {42614, 230}, {790, 220}, {4958, 230}, {2289, 28}, {2271, 230}, {869, 230}, {1477, 220}, {6774, 230}, {3658, 107}, {7382, 220}, {7663, 230}, {819, 220}, {774, 230}, {7412, 230}, {8407, 230}, {70080, 9}, {119150, 216}, {1864, 220}, {858, 220}, {43307, 220}, {831, 230}, {3864, 220}, {7625, 230}, {7636, 230}, {1865, 230}, {7647, 230}, {6842, 220}, {2748, 7}, {11754, 230}, {822, 1}, {11503, 230}, {4038, 220}, {65063, 220}, {122911, 230}, {6458, 230}, {1431, 230}, {43242, 230}, {70507, 230}, {119168, 220}, {119179, 220}, {6834, 230}, {1750, 230}, {7655, 230}, {7666, 230}, {8410, 1}, {2388, 230}, {839, 220}, {2282, 230}, {2031, 230}, {7395, 1}, {92912, 1}, {7617, 230}, {7628, 230}, {801, 202}, {1857, 230}, {7377, 230}, {8400, 230}, {812, 220}, {861, 234}, {1479, 18}, {842, 230}, {1450, 220}, {1460, 14}, {72767, 9}, {71231, 9}, {119144, 1}, {125142, 220}, {1471, 23}, {122892, 230}, {122903, 230}, {11765, 230}, {122883, 230}, {119210, 230}, {2301, 230}, {43347, 9}, {8429, 220}, {2072, 230}, {92981, 230}, {65058, 230}, {3964, 130}, {1625, 230}, {70851, 7}, {2263, 230}, {872, 230}, {6777, 230}, {7385, 220}, {1761, 230}, {1772, 230}, {3897, 216}, {789, 232}, {8421, 1}, {119142, 216}, {66422, 230}, {70377, 7}, {119153, 216}, {122922, 230}, {2080, 230}, {2091, 230}, {1840, 230}, {6980, 9}, {12331, 228}, {834, 230}, {1442, 220}, {2364, 7}, {7639, 230}, {122884, 230}, {122895, 230}, {11746, 230}, {11757, 230}, {2293, 230}, {65066, 220}, {125254, 230}, {785, 230}, {860, 233}, {2137, 220}, {837, 240}, {3405, 9}, {43245, 230}, {1615, 31}, {2266, 230}, {3530, 9}, {70722, 9}, {7155, 9}, {70726, 7}, {1753, 230}, {1764, 230}, {7658, 230}, {814, 220}, {7669, 230}, {8402, 1}, {11647, 9}, {1848, 220}, {853, 220}, {7398, 1}, {92915, 1}, {7620, 230}, {6837, 220}, {42617, 230}, {804, 220}, {815, 220}, {69818, 7}, {8432, 230}, {777, 230}, {788, 230}, {43456, 9}, {2285, 220}, {119213, 230}, {1426, 230}, {43237, 230}, {70502, 230}, {119163, 220}, {5908, 9}, {7677, 220}, {1614, 30}, {1467, 20}, {864, 234}, {42618, 230}, {3656, 107}, {7650, 230}, {8405, 230}, {65061, 230}, {69939, 9}, {3784, 122}, {6845, 220}, {119175, 230}, {845, 220}, {2277, 230}, {43703, 230}, {875, 230}, {6780, 230}, {2290, 29}, {68153, 1}, {66425, 230}, {7676, 233}, {807, 202}, {12335, 224}, {2083, 230}, {796, 220}, {3785, 122}, {125137, 220}, {122898, 230}, {11749, 230}, {2296, 230}, {1156, 230}, {43711, 230}, {8424, 220}, {65069, 220}, {125257, 230}, {863, 233}, {70003, 7}, {43700, 220}, {43248, 230}, {1620, 230}, {8425, 230}, {826, 220}, {2269, 230}, {7023, 230}, {7631, 220}, {867, 230}, {878, 230}, {1756, 230}, {7661, 230}, {1466, 19}, {11768, 230}, {1851, 220}, {68154, 220}, {1437, 230}, {2034, 220}, {70513, 230}, {2075, 230}, {2086, 230}, {43239, 230}, {829, 230}, {7627, 230}, {7623, 230}, {122890, 230}, {42620, 230}, {820, 1}, {70850, 9}, {2299, 230}, {7388, 220}, {1628, 220}, {780, 230}, {866, 233}, {1429, 230}, {43240, 230}, {70505, 230}, {2261, 230}, {848, 230}, {859, 230}, {1445, 220}, {1759, 230}, {3659, 107}, {1560, 30}, {798, 220}, {809, 220}, {3640, 103}, {7642, 230}, {7653, 230}, {3971, 230}, {7664, 230}, {8408, 1}, {11760, 230}, {11771, 230}, {122909, 230}, {125139, 220}, {70378, 9}, {1554, 230}, {92976, 230}, {1855, 230}, {1448, 230}, {69759, 9}, {42612, 230}, {799, 220}, {2280, 230}, {3769, 118}, {2029, 230}, {1767, 230}, {772, 230}, {783, 230}, {8427, 1}, {1862, 220}, {43232, 230}, {1622, 220}, {1555, 230}, {1159, 230}, {6783, 220}, {6679, 230}, {119169, 220}, {7634, 230}, {7645, 230}, {65056, 230}, {71351, 7}, {1623, 230}, {6840, 220}, {11752, 230}, {818, 220}, {7026, 230}, {870, 230}, {6775, 230}, {122920, 230}, {1461, 15}, {1440, 230}, {1451, 230}, {70516, 230}, {2078, 230}, {2089, 230}, {791, 220}, {1617, 33}, {3642, 9}, {6832, 230}, {6843, 230}, {44013, 9}, {832, 230}, {119363, 230}, {1462, 16}, {70477, 9}, {122882, 230}, {11744, 230}, {1854, 220}, {69702, 9}, {2291, 230}, {2302, 230}, {65064, 220}, {7391, 220}, {43308, 220}, {2386, 220}, {119178, 220}, {4151, 7}, {66272, 220}, {810, 220}, {1846, 230}, {1751, 230}, {7656, 230}, {6109, 230}, {122901, 230}, {122912, 230}, {3974, 230}, {11763, 230}, {11774, 230}, {1432, 230}, {70508, 230}, {2070, 230}, {2081, 230}, {6835, 230}, {7630, 214}, {3157, 84}, {68111, 230}, {43234, 230}, {1557, 230}, {69889, 230}, {92979, 230}, {3277, 9}, {1809, 36}, {3149, 9}, {3962, 130}, {42615, 230}, {840, 220}, {2272, 230}, {2283, 230}, {2032, 230}, {7396, 1}, {7383, 220}, {7675, 230}, {119170, 220}, {786, 230}, {775, 230}, {8428, 220}, {3972, 9}, {119151, 216}, {119211, 230}, {802, 202}, {862, 234}, {843, 230}, {68159, 9}, {7637, 230}, {7648, 230}, {793, 220}, {8403, 1}, {7386, 230}, {122893, 230}, {1626, 230}, {11755, 230}, {11766, 230}, {119173, 230}, {823, 1}, {11504, 230}, {125252, 230}, {43243, 230}, {42607, 230}, {2264, 230}, {873, 230}, {6778, 230}, {3895, 220}, {1762, 230}, {7667, 230}, {7678, 230}, {8411, 230}, {1468, 21}, {7416, 230}, {8422, 1}, {122885, 230}, {851, 220}, {43698, 230}, {7405, 220}, {92913, 1}, {1616, 32}, {7640, 230}, {7378, 230}, {11747, 230}, {813, 220}, {68152, 230}, {7021, 230}, {7618, 220}, {7380, 1}, {119145, 1}, {122904, 230}, {122915, 230}, {2138, 220}, {3953, 129}, {1770, 220}, {8430, 220}, {2073, 230}, {71467, 9}, {92982, 230}, {1754, 230}, {7659, 230}, {65059, 230}, {65070, 230}, {3954, 130}, {3965, 130}, {71350, 9}, {1849, 220}, {854, 220}, {2035, 230}, {7399, 1}, {12332, 232}, {778, 230}, {119154, 216}, {66423, 230}, {7621, 230}, {125258, 7}, {2092, 230}, {8404, 230}, {805, 220}, {835, 230}, {1443, 220}, {1456, 10}, {122896, 230}, {122907, 230}, {11758, 230}, {11769, 230}, {70503, 230}, {1773, 220}, {769, 230}, {65067, 220}, {1457, 11}, {7020, 220}, {125255, 230}, {1552, 230}, {2294, 220}, {69940, 9}, {1629, 230}, {2893, 9}, {119176, 230}, {68326, 220}, {2267, 230}, {2027, 230}, {876, 230}, {7389, 220}, {119165, 220}, {770, 230}, {1463, 17}, {1860, 220}, {92916, 1}, {12441, 8}, {1464, 18}, {838, 230}, {1435, 220}, {6313, 228}, {7643, 230}, {1476, 230}, {70198, 7}, {122888, 230}, {6838, 220}, {11750, 230}, {125140, 220}, {119149, 226}, {816, 220}, {5940, 9}, {2275, 220}, {2286, 220}, {1427, 230}, {1438, 230}, {43238, 230}, {43249, 230}, {70514, 230}, {119164, 220}, {42736, 230}, {1863, 230}, {865, 234}, {6752, 9}, {6773, 230}, {3657, 107}, {3021, 9}, {7651, 230}, {7662, 230}, {3968, 130}, {6457, 222}, {8406, 230}, {8417, 230}, {65062, 230}, {1841, 220}, {1852, 220}, {69817, 9}, {846, 220}, {857, 220}, {43704, 230}, {66426, 230}, {7624, 230}, {808, 202}, {1561, 31}, {1853, 230}, {42621, 230}, {821, 1}, {797, 220}, {3786, 122}, {849, 230}, {1446, 220}, {70460, 7}, {1562, 32}, {1648, 35}, {122899, 230}, {122910, 230}, {781, 230}, {11761, 230}, {2278, 220}, {70506, 230}, {1631, 220}, {1157, 230}, {119167, 220}, {6833, 230}, {7223, 7}, {92977, 230}, {2509, 9}, {4153, 9}, {7654, 230}, {7392, 230}, {2387, 230}, {7632, 202}, {827, 220}, {2270, 230}, {1611, 27}, {43696, 230}, {868, 230}, {879, 230}, {7381, 220}, {1768, 230}, {1621, 220}, {7394, 1}, {122918, 230}, {2297, 220}, {1449, 230}, {7082, 9}, {2076, 230}, {2087, 230}, {800, 220}, {6098, 9}, {830, 230}, {125141, 220}, {3260, 7}, {7635, 230}, {4959, 230}, {122880, 230}, {122891, 230}, {122902, 230}, {11753, 230}, {2300, 230}, {1618, 34}, {43241, 230}, {1624, 230}, {3865, 220}, {2637, 9}, {2262, 230}, {1866, 230}, {2273, 230}, {7027, 230}, {871, 230}, {6776, 230}, {7384, 220}, {1760, 230}, {7665, 230}, {3641, 103}, {8409, 1}, {122921, 230}, {119141, 216}, {11772, 230}, {1844, 220}, {2079, 230}, {6844, 230}, {119364, 230}, {833, 230}, {1430, 220}, {7616, 230}, {3956, 132}, {7376, 230}, {68109, 220}, {11745, 230} -}; \ No newline at end of file diff --git a/unicode-data.h b/unicode-data.h index 794859935..3cf84117c 100644 --- a/unicode-data.h +++ b/unicode-data.h @@ -14,4 +14,3 @@ extern const std::vector> unicode_ranges_symbol; extern const std::vector> unicode_ranges_control; extern const std::multimap unicode_map_nfd; extern const std::map unicode_map_lowercase; -extern const std::map unicode_canonical_class; diff --git a/unicode.cpp b/unicode.cpp index 341cac025..1764066dd 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -13,7 +13,6 @@ #include #include #include -#include static std::string unicode_cpts_to_utf8(const std::vector & cps) { std::string result; @@ -470,54 +469,21 @@ std::string unicode_cpt_to_utf8(uint32_t cp) { throw std::invalid_argument("invalid codepoint"); } -auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) { - auto cc_a_it = unicode_canonical_class.find(a); - if (cc_a_it != unicode_canonical_class.end()) { - auto cc_b_it = unicode_canonical_class.find(b); - if (cc_b_it != unicode_canonical_class.end()) { - return cc_a_it->second < cc_b_it->second; - } - - } - return false; -}; - -// Function to sort subsequences based on canonical class -std::vector sort_by_canonical_class(std::vector & cpts) { - // Sort the sequence using the custom comparator function - sort(cpts.begin(), cpts.end(), compareByCanonicalClass); - return cpts; -} - -std::vector canonical_decomposition_cpts(std::vector & cpts, uint32_t starting_offset) { +std::vector unicode_cpts_normalize_nfd(const std::vector & cpts) { std::vector result; - for (auto i = starting_offset; i < cpts.size(); i++) { - const auto& it = unicode_map_nfd.equal_range(cpts[i]); + for (uint32_t cpt : cpts) { + auto it = unicode_map_nfd.equal_range(cpt); if (it.first != it.second) { - uint offset = 0; for (auto jt = it.first; jt != it.second; jt++) { - if (offset == 0) { - cpts[i] = jt->second; - } else { - cpts.emplace(cpts.begin() + i + offset, jt->second); - } - offset++; + result.push_back(jt->second); } - const auto & inner_result = canonical_decomposition_cpts(cpts, i); - result.insert(result.end(), inner_result.begin(), inner_result.end()); - break; } else { - result.push_back(cpts[i]); + result.push_back(cpt); } } return result; } -std::vector unicode_cpts_normalize_nfd(std::vector & cpts) { - auto result = canonical_decomposition_cpts(cpts, 0); - return sort_by_canonical_class(result); -} - std::vector unicode_cpts_from_utf8(const std::string & utf8) { std::vector result; size_t offset = 0; diff --git a/unicode.h b/unicode.h index 89f170144..e9026dc81 100644 --- a/unicode.h +++ b/unicode.h @@ -16,9 +16,7 @@ std::string unicode_cpt_to_utf8(uint32_t cp); std::vector unicode_cpts_from_utf8(const std::string & utf8); -std::vector unicode_cpts_normalize_nfd(std::vector & cpts); -std::vector canonical_decomposition_cpts(std::vector & cpts, uint32_t starting_offset); -std::vector sort_by_canonical_class(std::vector & cpts); +std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); int unicode_cpt_type(uint32_t cp); int unicode_cpt_type(const std::string & utf8);