From 4056dc5b1e07e19feccf5c93daa7856ab98efadf Mon Sep 17 00:00:00 2001 From: Kazim Abrar Mahi Date: Mon, 1 Apr 2024 00:48:49 +0600 Subject: [PATCH] added and refactored unicode_regex_split and related functions --- llama.cpp | 113 +++++++----------------- unicode.cpp | 247 ++++++++++++++++++++++++++++++++++++++++++++++++---- unicode.h | 7 +- 3 files changed, 264 insertions(+), 103 deletions(-) diff --git a/llama.cpp b/llama.cpp index bf43d127c..0dca86f67 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12010,13 +12010,13 @@ struct llm_tokenizer_bpe { std::vector word_collection; switch (vocab.type) { case LLAMA_VOCAB_TYPE_BPE: - word_collection = bpe_gpt2_preprocess(text); + word_collection = unicode_regex_split(text, gpt2_regex); break; case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: - word_collection = bpe_deepseek_coder_preprocess(text); + word_collection = unicode_regex_split(text, deepseek_coder_regex); break; case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: - word_collection = bpe_deepseek_llm_preprocess(text); + word_collection = unicode_regex_split(text, deepseek_llm_regex); break; default: break; @@ -12147,90 +12147,41 @@ private: work_queue.push(bigram); } - std::vector byte_encoding_process(const std::vector & bpe_words) { - std::vectorbpe_encoded_words; - for (auto word : bpe_words) { - std::string text_utf = ""; - auto utf_word = unicode_cpts_from_utf8(word); - for (size_t i = 0; i < utf_word.size(); ++i) - text_utf += unicode_cpt_to_utf8(utf_word[i]); - - std::string encoded_token = ""; - for (char & c : text_utf) { - encoded_token += unicode_byte_to_utf8(c); - } - bpe_encoded_words.emplace_back(encoded_token); - } - return bpe_encoded_words; - } - - std::vector regex_preprocess(const std::wstring & text, const std::vector & offsets, const std::wstring & regex_expr) { - std::wregex expr(regex_expr); - std::vector bpe_words; // stroe the offset of each word - bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for (auto offset : 
offsets) { - std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::wcregex_iterator end; - - int64_t start_idx = 0; - while (it != end) { - std::wcmatch match = *it; - if (match.position() > start_idx) { - bpe_words.emplace_back(match.position() - start_idx); - } - bpe_words.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; - } - - if (start_idx < (int64_t) offset) { - bpe_words.emplace_back(offset - start_idx); - } - start += offset; - } - - return bpe_words; - } - - std::vector regex_bpe_preprocess(const std::string & text, const std::vector & regex_exprs) { - std::wstring wtext = from_utf8(text); - - std::vector bpe_offsets = {wtext.size()}; - - for(auto & regex_expr : regex_exprs) { - bpe_offsets = regex_preprocess(wtext, bpe_offsets, regex_expr); - } - - std::vector bpe_words; - bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for(size_t & offset : bpe_offsets){ - bpe_words.emplace_back(to_utf8(std::wstring(wtext, start, offset))); - start += offset; - } - - return byte_encoding_process(bpe_words); - } - - std::vector bpe_gpt2_preprocess(const std::string & text) { - return regex_bpe_preprocess(text, get_gpt2_regex()); - } - - std::vector bpe_deepseek_coder_preprocess(const std::string & text) { - return regex_bpe_preprocess(text, get_deepseek_coder_regex()); - } - - std::vector bpe_deepseek_llm_preprocess(const std::string & text) { - return regex_bpe_preprocess(text, get_deepseek_llm_regex()); - } - const llama_vocab & vocab; std::vector symbols; std::vector symbols_final; llm_bigram_bpe::queue work_queue; + + const std::vector gpt2_regex = { + // //punc: \{p} and ascii puncs + 
L"[\U00000021-\U0000002F\U0000003A-\U00000040\\\U0000005B-\U00000060\U0000007B-\U0000007E\U000000A1-\U000000A1\U000000A7-\U000000A7\U000000AB-\U000000AB\U000000B6-\U000000B7\U000000BB-\U000000BB\U000000BF-\U000000BF\U0000037E-\U0000037E\U00000387-\U00000387\U0000055A-\U0000055F\U00000589-\U0000058A\U000005BE-\U000005BE\U000005C0-\U000005C0\U000005C3-\U000005C3\U000005C6-\U000005C6\U000005F3-\U000005F4\U00000609-\U0000060A\U0000060C-\U0000060D\U0000061B-\U0000061B\U0000061E-\U0000061F\U0000066A-\U0000066D\U000006D4-\U000006D4\U00000700-\U0000070D\U000007F7-\U000007F9\U00000830-\U0000083E\U0000085E-\U0000085E\U00000964-\U00000965\U00000970-\U00000970\U000009FD-\U000009FD\U00000A76-\U00000A76\U00000AF0-\U00000AF0\U00000C77-\U00000C77\U00000C84-\U00000C84\U00000DF4-\U00000DF4\U00000E4F-\U00000E4F\U00000E5A-\U00000E5B\U00000F04-\U00000F12\U00000F14-\U00000F14\U00000F3A-\U00000F3D\U00000F85-\U00000F85\U00000FD0-\U00000FD4\U00000FD9-\U00000FDA\U0000104A-\U0000104F\U000010FB-\U000010FB\U00001360-\U00001368\U00001400-\U00001400\U0000166E-\U0000166E\U0000169B-\U0000169C\U000016EB-\U000016ED\U00001735-\U00001736\U000017D4-\U000017D6\U000017D8-\U000017DA\U00001800-\U0000180A\U00001944-\U00001945\U00001A1E-\U00001A1F\U00001AA0-\U00001AA6\U00001AA8-\U00001AAD\U00001B5A-\U00001B60\U00001BFC-\U00001BFF\U00001C3B-\U00001C3F\U00001C7E-\U00001C7F\U00001CC0-\U00001CC7\U00001CD3-\U00001CD3\U00002010-\U00002027\U00002030-\U00002043\U00002045-\U00002051\U00002053-\U0000205E\U0000207D-\U0000207E\U0000208D-\U0000208E\U00002308-\U0000230B\U00002329-\U0000232A\U00002768-\U00002775\U000027C5-\U000027C6\U000027E6-\U000027EF\U00002983-\U00002998\U000029D8-\U000029DB\U000029FC-\U000029FD\U00002CF9-\U00002CFC\U00002CFE-\U00002CFF\U00002D70-\U00002D70\U00002E00-\U00002E2E\U00002E30-\U00002E4F\U00002E52-\U00002E52\U00003001-\U00003003\U00003008-\U00003011\U00003014-\U0000301F\U00003030-\U00003030\U0000303D-\U0000303D\U000030A0-\U000030A0\U000030FB-\U000030FB\U0000A4FE-\U0000A4FF\U0000A60D-\U0000A60F
\U0000A673-\U0000A673\U0000A67E-\U0000A67E\U0000A6F2-\U0000A6F7\U0000A874-\U0000A877\U0000A8CE-\U0000A8CF\U0000A8F8-\U0000A8FA\U0000A8FC-\U0000A8FC\U0000A92E-\U0000A92F\U0000A95F-\U0000A95F\U0000A9C1-\U0000A9CD\U0000A9DE-\U0000A9DF\U0000AA5C-\U0000AA5F\U0000AADE-\U0000AADF\U0000AAF0-\U0000AAF1\U0000ABEB-\U0000ABEB\U0000FD3E-\U0000FD3F\U0000FE10-\U0000FE19\U0000FE30-\U0000FE52\U0000FE54-\U0000FE61\U0000FE63-\U0000FE63\U0000FE68-\U0000FE68\U0000FE6A-\U0000FE6B\U0000FF01-\U0000FF03\U0000FF05-\U0000FF0A\U0000FF0C-\U0000FF0F\U0000FF1A-\U0000FF1B\U0000FF1F-\U0000FF20\U0000FF3B-\U0000FF3D\U0000FF3F-\U0000FF3F\U0000FF5B-\U0000FF5B\U0000FF5D-\U0000FF5D\U0000FF5F-\U0000FF65\U00010100-\U00010102\U0001039F-\U0001039F\U000103D0-\U000103D0\U0001056F-\U0001056F\U00010857-\U00010857\U0001091F-\U0001091F\U0001093F-\U0001093F\U00010A50-\U00010A58\U00010A7F-\U00010A7F\U00010AF0-\U00010AF6\U00010B39-\U00010B3F\U00010B99-\U00010B9C\U00010EAD-\U00010EAD\U00010F55-\U00010F59\U00011047-\U0001104D\U000110BB-\U000110BC\U000110BE-\U000110C1\U00011140-\U00011143\U00011174-\U00011175\U000111C5-\U000111C8\U000111CD-\U000111CD\U000111DB-\U000111DB\U000111DD-\U000111DF\U00011238-\U0001123D\U000112A9-\U000112A9\U0001144B-\U0001144F\U0001145A-\U0001145B\U0001145D-\U0001145D\U000114C6-\U000114C6\U000115C1-\U000115D7\U00011641-\U00011643\U00011660-\U0001166C\U0001173C-\U0001173E\U0001183B-\U0001183B\U00011944-\U00011946\U000119E2-\U000119E2\U00011A3F-\U00011A46\U00011A9A-\U00011A9C\U00011A9E-\U00011AA2\U00011C41-\U00011C45\U00011C70-\U00011C71\U00011EF7-\U00011EF8\U00011FFF-\U00011FFF\U00012470-\U00012474\U00016A6E-\U00016A6F\U00016AF5-\U00016AF5\U00016B37-\U00016B3B\U00016B44-\U00016B44\U00016E97-\U00016E9A\U00016FE2-\U00016FE2\U0001BC9F-\U0001BC9F\U0001DA87-\U0001DA8B\U0001E95E-\U0001E95F]+", + // //'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S) + L"'s|'t|'re|'ve|'m|'ll|'d| 
?[\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\U000009FC\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3D\U0
0000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U00001066\U0000106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U000016
6F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\U00002119-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD0-\U
00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000D7B0-\U0000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U00010
00B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-\U00011176\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114C7\U
00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U0001BC88\U0001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0001D
770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A]+| 
?[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+| 
?[^\\s\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\U000009FC\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3
D\U00000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U00001066\U0000106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U00
00166F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\U00002119-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD
0-\U00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000D7B0-\U0000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U0
001000B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-\U00011176\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114
C7\U00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U0001BC88\U0001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0
001D770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A6
20-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+|\\s+(?!\\S)", + //digits + L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\
U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]+", + L"[0-9][0-9][0-9]" + }; + const std::vector deepseek_coder_regex = { + L"[\r\n]", + //\s?\p{L}+ + L"\\s?[\U00000041-\U0000005A\U00000061-\U0000007A\U000000AA-\U000000AA\U000000B5-\U000000B5\U000000BA-\U000000BA\U000000C0-\U000000D6\U000000D8-\U000000F6\U000000F8-\U000002C1\U000002C6-\U000002D1\U000002E0-\U000002E4\U000002EC-\U000002EC\U000002EE-\U000002EE\U00000370-\U00000374\U00000376-\U00000377\U0000037A-\U0000037D\U0000037F-\U0000037F\U00000386-\U00000386\U00000388-\U0000038A\U0000038C-\U0000038C\U0000038E-\U000003A1\U000003A3-\U000003F5\U000003F7-\U00000481\U0000048A-\U0000052F\U00000531-\U00000556\U00000559-\U00000559\U00000560-\U00000588\U000005D0-\U000005EA\U000005EF-\U000005F2\U00000620-\U0000064A\U0000066E-\U0000066F\U00000671-\U000006D3\U000006D5-\U000006D5\U000006E5-\U000006E6\U000006EE-\U000006EF\U000006FA-\U000006FC\U000006FF-\U000006FF\U00000710-\U00000710\U00000712-\U0000072F\U0000074D-\U000007A5\U000007B1-\U000007B1\U000007CA-\U000007EA\U000007F4-\U000007F5\U000007FA-\U000007FA\U00000800-\U00000815\U0000081A-\U0000081A\U00000824-\U00000824\U00000828-\U00000828\U00000840-\U00000858\U00000860-\U0000086A\U000008A0-\U000008B4\U000008B6-\U000008C7\U00000904-\U00000939\U0000093D-\U0000093D\U00000950-\U00000950\U00000958-\U00000961\U00000971-\U00000980\U00000985-\U0000098C\U0000098F-\U00000990\U00000993-\U000009A8\U000009AA-\U000009B0\U000009B2-\U000009B2\U000009B6-\U000009B9\U000009BD-\U000009BD\U000009CE-\U000009CE\U000009DC-\U000009DD\U000009DF-\U000009E1\U000009F0-\U000009F1\U000009FC-\U000009F
C\U00000A05-\U00000A0A\U00000A0F-\U00000A10\U00000A13-\U00000A28\U00000A2A-\U00000A30\U00000A32-\U00000A33\U00000A35-\U00000A36\U00000A38-\U00000A39\U00000A59-\U00000A5C\U00000A5E-\U00000A5E\U00000A72-\U00000A74\U00000A85-\U00000A8D\U00000A8F-\U00000A91\U00000A93-\U00000AA8\U00000AAA-\U00000AB0\U00000AB2-\U00000AB3\U00000AB5-\U00000AB9\U00000ABD-\U00000ABD\U00000AD0-\U00000AD0\U00000AE0-\U00000AE1\U00000AF9-\U00000AF9\U00000B05-\U00000B0C\U00000B0F-\U00000B10\U00000B13-\U00000B28\U00000B2A-\U00000B30\U00000B32-\U00000B33\U00000B35-\U00000B39\U00000B3D-\U00000B3D\U00000B5C-\U00000B5D\U00000B5F-\U00000B61\U00000B71-\U00000B71\U00000B83-\U00000B83\U00000B85-\U00000B8A\U00000B8E-\U00000B90\U00000B92-\U00000B95\U00000B99-\U00000B9A\U00000B9C-\U00000B9C\U00000B9E-\U00000B9F\U00000BA3-\U00000BA4\U00000BA8-\U00000BAA\U00000BAE-\U00000BB9\U00000BD0-\U00000BD0\U00000C05-\U00000C0C\U00000C0E-\U00000C10\U00000C12-\U00000C28\U00000C2A-\U00000C39\U00000C3D-\U00000C3D\U00000C58-\U00000C5A\U00000C60-\U00000C61\U00000C80-\U00000C80\U00000C85-\U00000C8C\U00000C8E-\U00000C90\U00000C92-\U00000CA8\U00000CAA-\U00000CB3\U00000CB5-\U00000CB9\U00000CBD-\U00000CBD\U00000CDE-\U00000CDE\U00000CE0-\U00000CE1\U00000CF1-\U00000CF2\U00000D04-\U00000D0C\U00000D0E-\U00000D10\U00000D12-\U00000D3A\U00000D3D-\U00000D3D\U00000D4E-\U00000D4E\U00000D54-\U00000D56\U00000D5F-\U00000D61\U00000D7A-\U00000D7F\U00000D85-\U00000D96\U00000D9A-\U00000DB1\U00000DB3-\U00000DBB\U00000DBD-\U00000DBD\U00000DC0-\U00000DC6\U00000E01-\U00000E30\U00000E32-\U00000E33\U00000E40-\U00000E46\U00000E81-\U00000E82\U00000E84-\U00000E84\U00000E86-\U00000E8A\U00000E8C-\U00000EA3\U00000EA5-\U00000EA5\U00000EA7-\U00000EB0\U00000EB2-\U00000EB3\U00000EBD-\U00000EBD\U00000EC0-\U00000EC4\U00000EC6-\U00000EC6\U00000EDC-\U00000EDF\U00000F00-\U00000F00\U00000F40-\U00000F47\U00000F49-\U00000F6C\U00000F88-\U00000F8C\U00001000-\U0000102A\U0000103F-\U0000103F\U00001050-\U00001055\U0000105A-\U0000105D\U00001061-\U00001061\U00001065-\U00001066\U00
00106E-\U00001070\U00001075-\U00001081\U0000108E-\U0000108E\U000010A0-\U000010C5\U000010C7-\U000010C7\U000010CD-\U000010CD\U000010D0-\U000010FA\U000010FC-\U00001248\U0000124A-\U0000124D\U00001250-\U00001256\U00001258-\U00001258\U0000125A-\U0000125D\U00001260-\U00001288\U0000128A-\U0000128D\U00001290-\U000012B0\U000012B2-\U000012B5\U000012B8-\U000012BE\U000012C0-\U000012C0\U000012C2-\U000012C5\U000012C8-\U000012D6\U000012D8-\U00001310\U00001312-\U00001315\U00001318-\U0000135A\U00001380-\U0000138F\U000013A0-\U000013F5\U000013F8-\U000013FD\U00001401-\U0000166C\U0000166F-\U0000167F\U00001681-\U0000169A\U000016A0-\U000016EA\U000016F1-\U000016F8\U00001700-\U0000170C\U0000170E-\U00001711\U00001720-\U00001731\U00001740-\U00001751\U00001760-\U0000176C\U0000176E-\U00001770\U00001780-\U000017B3\U000017D7-\U000017D7\U000017DC-\U000017DC\U00001820-\U00001878\U00001880-\U00001884\U00001887-\U000018A8\U000018AA-\U000018AA\U000018B0-\U000018F5\U00001900-\U0000191E\U00001950-\U0000196D\U00001970-\U00001974\U00001980-\U000019AB\U000019B0-\U000019C9\U00001A00-\U00001A16\U00001A20-\U00001A54\U00001AA7-\U00001AA7\U00001B05-\U00001B33\U00001B45-\U00001B4B\U00001B83-\U00001BA0\U00001BAE-\U00001BAF\U00001BBA-\U00001BE5\U00001C00-\U00001C23\U00001C4D-\U00001C4F\U00001C5A-\U00001C7D\U00001C80-\U00001C88\U00001C90-\U00001CBA\U00001CBD-\U00001CBF\U00001CE9-\U00001CEC\U00001CEE-\U00001CF3\U00001CF5-\U00001CF6\U00001CFA-\U00001CFA\U00001D00-\U00001DBF\U00001E00-\U00001F15\U00001F18-\U00001F1D\U00001F20-\U00001F45\U00001F48-\U00001F4D\U00001F50-\U00001F57\U00001F59-\U00001F59\U00001F5B-\U00001F5B\U00001F5D-\U00001F5D\U00001F5F-\U00001F7D\U00001F80-\U00001FB4\U00001FB6-\U00001FBC\U00001FBE-\U00001FBE\U00001FC2-\U00001FC4\U00001FC6-\U00001FCC\U00001FD0-\U00001FD3\U00001FD6-\U00001FDB\U00001FE0-\U00001FEC\U00001FF2-\U00001FF4\U00001FF6-\U00001FFC\U00002071-\U00002071\U0000207F-\U0000207F\U00002090-\U0000209C\U00002102-\U00002102\U00002107-\U00002107\U0000210A-\U00002113\U00002115-\U00002115\U0000211
9-\U0000211D\U00002124-\U00002124\U00002126-\U00002126\U00002128-\U00002128\U0000212A-\U0000212D\U0000212F-\U00002139\U0000213C-\U0000213F\U00002145-\U00002149\U0000214E-\U0000214E\U00002183-\U00002184\U00002C00-\U00002C2E\U00002C30-\U00002C5E\U00002C60-\U00002CE4\U00002CEB-\U00002CEE\U00002CF2-\U00002CF3\U00002D00-\U00002D25\U00002D27-\U00002D27\U00002D2D-\U00002D2D\U00002D30-\U00002D67\U00002D6F-\U00002D6F\U00002D80-\U00002D96\U00002DA0-\U00002DA6\U00002DA8-\U00002DAE\U00002DB0-\U00002DB6\U00002DB8-\U00002DBE\U00002DC0-\U00002DC6\U00002DC8-\U00002DCE\U00002DD0-\U00002DD6\U00002DD8-\U00002DDE\U00002E2F-\U00002E2F\U00003005-\U00003006\U00003031-\U00003035\U0000303B-\U0000303C\U00003041-\U00003096\U0000309D-\U0000309F\U000030A1-\U000030FA\U000030FC-\U000030FF\U00003105-\U0000312F\U00003131-\U0000318E\U000031A0-\U000031BF\U000031F0-\U000031FF\U00003400-\U00004DBF\U00004E00-\U00009FFC\U0000A000-\U0000A48C\U0000A4D0-\U0000A4FD\U0000A500-\U0000A60C\U0000A610-\U0000A61F\U0000A62A-\U0000A62B\U0000A640-\U0000A66E\U0000A67F-\U0000A69D\U0000A6A0-\U0000A6E5\U0000A717-\U0000A71F\U0000A722-\U0000A788\U0000A78B-\U0000A7BF\U0000A7C2-\U0000A7CA\U0000A7F5-\U0000A801\U0000A803-\U0000A805\U0000A807-\U0000A80A\U0000A80C-\U0000A822\U0000A840-\U0000A873\U0000A882-\U0000A8B3\U0000A8F2-\U0000A8F7\U0000A8FB-\U0000A8FB\U0000A8FD-\U0000A8FE\U0000A90A-\U0000A925\U0000A930-\U0000A946\U0000A960-\U0000A97C\U0000A984-\U0000A9B2\U0000A9CF-\U0000A9CF\U0000A9E0-\U0000A9E4\U0000A9E6-\U0000A9EF\U0000A9FA-\U0000A9FE\U0000AA00-\U0000AA28\U0000AA40-\U0000AA42\U0000AA44-\U0000AA4B\U0000AA60-\U0000AA76\U0000AA7A-\U0000AA7A\U0000AA7E-\U0000AAAF\U0000AAB1-\U0000AAB1\U0000AAB5-\U0000AAB6\U0000AAB9-\U0000AABD\U0000AAC0-\U0000AAC0\U0000AAC2-\U0000AAC2\U0000AADB-\U0000AADD\U0000AAE0-\U0000AAEA\U0000AAF2-\U0000AAF4\U0000AB01-\U0000AB06\U0000AB09-\U0000AB0E\U0000AB11-\U0000AB16\U0000AB20-\U0000AB26\U0000AB28-\U0000AB2E\U0000AB30-\U0000AB5A\U0000AB5C-\U0000AB69\U0000AB70-\U0000ABE2\U0000AC00-\U0000D7A3\U0000D7B0-\U0
000D7C6\U0000D7CB-\U0000D7FB\U0000F900-\U0000FA6D\U0000FA70-\U0000FAD9\U0000FB00-\U0000FB06\U0000FB13-\U0000FB17\U0000FB1D-\U0000FB1D\U0000FB1F-\U0000FB28\U0000FB2A-\U0000FB36\U0000FB38-\U0000FB3C\U0000FB3E-\U0000FB3E\U0000FB40-\U0000FB41\U0000FB43-\U0000FB44\U0000FB46-\U0000FBB1\U0000FBD3-\U0000FD3D\U0000FD50-\U0000FD8F\U0000FD92-\U0000FDC7\U0000FDF0-\U0000FDFB\U0000FE70-\U0000FE74\U0000FE76-\U0000FEFC\U0000FF21-\U0000FF3A\U0000FF41-\U0000FF5A\U0000FF66-\U0000FFBE\U0000FFC2-\U0000FFC7\U0000FFCA-\U0000FFCF\U0000FFD2-\U0000FFD7\U0000FFDA-\U0000FFDC\U00010000-\U0001000B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010280-\U0001029C\U000102A0-\U000102D0\U00010300-\U0001031F\U0001032D-\U00010340\U00010342-\U00010349\U00010350-\U00010375\U00010380-\U0001039D\U000103A0-\U000103C3\U000103C8-\U000103CF\U00010400-\U0001049D\U000104B0-\U000104D3\U000104D8-\U000104FB\U00010500-\U00010527\U00010530-\U00010563\U00010600-\U00010736\U00010740-\U00010755\U00010760-\U00010767\U00010800-\U00010805\U00010808-\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C-\U0001083C\U0001083F-\U00010855\U00010860-\U00010876\U00010880-\U0001089E\U000108E0-\U000108F2\U000108F4-\U000108F5\U00010900-\U00010915\U00010920-\U00010939\U00010980-\U000109B7\U000109BE-\U000109BF\U00010A00-\U00010A00\U00010A10-\U00010A13\U00010A15-\U00010A17\U00010A19-\U00010A35\U00010A60-\U00010A7C\U00010A80-\U00010A9C\U00010AC0-\U00010AC7\U00010AC9-\U00010AE4\U00010B00-\U00010B35\U00010B40-\U00010B55\U00010B60-\U00010B72\U00010B80-\U00010B91\U00010C00-\U00010C48\U00010C80-\U00010CB2\U00010CC0-\U00010CF2\U00010D00-\U00010D23\U00010E80-\U00010EA9\U00010EB0-\U00010EB1\U00010F00-\U00010F1C\U00010F27-\U00010F27\U00010F30-\U00010F45\U00010FB0-\U00010FC4\U00010FE0-\U00010FF6\U00011003-\U00011037\U00011083-\U000110AF\U000110D0-\U000110E8\U00011103-\U00011126\U00011144-\U00011144\U00011147-\U00011147\U00011150-\U00011172\U00011176-\U000111
76\U00011183-\U000111B2\U000111C1-\U000111C4\U000111DA-\U000111DA\U000111DC-\U000111DC\U00011200-\U00011211\U00011213-\U0001122B\U00011280-\U00011286\U00011288-\U00011288\U0001128A-\U0001128D\U0001128F-\U0001129D\U0001129F-\U000112A8\U000112B0-\U000112DE\U00011305-\U0001130C\U0001130F-\U00011310\U00011313-\U00011328\U0001132A-\U00011330\U00011332-\U00011333\U00011335-\U00011339\U0001133D-\U0001133D\U00011350-\U00011350\U0001135D-\U00011361\U00011400-\U00011434\U00011447-\U0001144A\U0001145F-\U00011461\U00011480-\U000114AF\U000114C4-\U000114C5\U000114C7-\U000114C7\U00011580-\U000115AE\U000115D8-\U000115DB\U00011600-\U0001162F\U00011644-\U00011644\U00011680-\U000116AA\U000116B8-\U000116B8\U00011700-\U0001171A\U00011800-\U0001182B\U000118A0-\U000118DF\U000118FF-\U00011906\U00011909-\U00011909\U0001190C-\U00011913\U00011915-\U00011916\U00011918-\U0001192F\U0001193F-\U0001193F\U00011941-\U00011941\U000119A0-\U000119A7\U000119AA-\U000119D0\U000119E1-\U000119E1\U000119E3-\U000119E3\U00011A00-\U00011A00\U00011A0B-\U00011A32\U00011A3A-\U00011A3A\U00011A50-\U00011A50\U00011A5C-\U00011A89\U00011A9D-\U00011A9D\U00011AC0-\U00011AF8\U00011C00-\U00011C08\U00011C0A-\U00011C2E\U00011C40-\U00011C40\U00011C72-\U00011C8F\U00011D00-\U00011D06\U00011D08-\U00011D09\U00011D0B-\U00011D30\U00011D46-\U00011D46\U00011D60-\U00011D65\U00011D67-\U00011D68\U00011D6A-\U00011D89\U00011D98-\U00011D98\U00011EE0-\U00011EF2\U00011FB0-\U00011FB0\U00012000-\U00012399\U00012480-\U00012543\U00013000-\U0001342E\U00014400-\U00014646\U00016800-\U00016A38\U00016A40-\U00016A5E\U00016AD0-\U00016AED\U00016B00-\U00016B2F\U00016B40-\U00016B43\U00016B63-\U00016B77\U00016B7D-\U00016B8F\U00016E40-\U00016E7F\U00016F00-\U00016F4A\U00016F50-\U00016F50\U00016F93-\U00016F9F\U00016FE0-\U00016FE1\U00016FE3-\U00016FE3\U00017000-\U000187F7\U00018800-\U00018CD5\U00018D00-\U00018D08\U0001B000-\U0001B11E\U0001B150-\U0001B152\U0001B164-\U0001B167\U0001B170-\U0001B2FB\U0001BC00-\U0001BC6A\U0001BC70-\U0001BC7C\U0001BC80-\U0001BC88\U0
001BC90-\U0001BC99\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2-\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB-\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546-\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A5\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0001D770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7CB\U0001E100-\U0001E12C\U0001E137-\U0001E13D\U0001E14E-\U0001E14E\U0001E2C0-\U0001E2EB\U0001E800-\U0001E8C4\U0001E900-\U0001E943\U0001E94B-\U0001E94B\U0001EE00-\U0001EE03\U0001EE05-\U0001EE1F\U0001EE21-\U0001EE22\U0001EE24-\U0001EE24\U0001EE27-\U0001EE27\U0001EE29-\U0001EE32\U0001EE34-\U0001EE37\U0001EE39-\U0001EE39\U0001EE3B-\U0001EE3B\U0001EE42-\U0001EE42\U0001EE47-\U0001EE47\U0001EE49-\U0001EE49\U0001EE4B-\U0001EE4B\U0001EE4D-\U0001EE4F\U0001EE51-\U0001EE52\U0001EE54-\U0001EE54\U0001EE57-\U0001EE57\U0001EE59-\U0001EE59\U0001EE5B-\U0001EE5B\U0001EE5D-\U0001EE5D\U0001EE5F-\U0001EE5F\U0001EE61-\U0001EE62\U0001EE64-\U0001EE64\U0001EE67-\U0001EE6A\U0001EE6C-\U0001EE72\U0001EE74-\U0001EE77\U0001EE79-\U0001EE7C\U0001EE7E-\U0001EE7E\U0001EE80-\U0001EE89\U0001EE8B-\U0001EE9B\U0001EEA1-\U0001EEA3\U0001EEA5-\U0001EEA9\U0001EEAB-\U0001EEBB\U00020000-\U0002A6DD\U0002A700-\U0002B734\U0002B740-\U0002B81D\U0002B820-\U0002CEA1\U0002CEB0-\U0002EBE0\U0002F800-\U0002FA1D\U00030000-\U0003134A]+", + //\s?\p{P}+ + 
L"\\s?[\U00000021-\U00000023\U00000025-\\\U0000002A\U0000002C-\U0000002F\U0000003A-\U0000003B\\\U0000003F-\U00000040\\\U0000005B-\\\U0000005D\U0000005F-\U0000005F\U0000007B-\U0000007B\U0000007D-\U0000007D\U000000A1-\U000000A1\U000000A7-\U000000A7\U000000AB-\U000000AB\U000000B6-\U000000B7\U000000BB-\U000000BB\U000000BF-\U000000BF\U0000037E-\U0000037E\U00000387-\U00000387\U0000055A-\U0000055F\U00000589-\U0000058A\U000005BE-\U000005BE\U000005C0-\U000005C0\U000005C3-\U000005C3\U000005C6-\U000005C6\U000005F3-\U000005F4\U00000609-\U0000060A\U0000060C-\U0000060D\U0000061B-\U0000061B\U0000061E-\U0000061F\U0000066A-\U0000066D\U000006D4-\U000006D4\U00000700-\U0000070D\U000007F7-\U000007F9\U00000830-\U0000083E\U0000085E-\U0000085E\U00000964-\U00000965\U00000970-\U00000970\U000009FD-\U000009FD\U00000A76-\U00000A76\U00000AF0-\U00000AF0\U00000C77-\U00000C77\U00000C84-\U00000C84\U00000DF4-\U00000DF4\U00000E4F-\U00000E4F\U00000E5A-\U00000E5B\U00000F04-\U00000F12\U00000F14-\U00000F14\U00000F3A-\U00000F3D\U00000F85-\U00000F85\U00000FD0-\U00000FD4\U00000FD9-\U00000FDA\U0000104A-\U0000104F\U000010FB-\U000010FB\U00001360-\U00001368\U00001400-\U00001400\U0000166E-\U0000166E\U0000169B-\U0000169C\U000016EB-\U000016ED\U00001735-\U00001736\U000017D4-\U000017D6\U000017D8-\U000017DA\U00001800-\U0000180A\U00001944-\U00001945\U00001A1E-\U00001A1F\U00001AA0-\U00001AA6\U00001AA8-\U00001AAD\U00001B5A-\U00001B60\U00001BFC-\U00001BFF\U00001C3B-\U00001C3F\U00001C7E-\U00001C7F\U00001CC0-\U00001CC7\U00001CD3-\U00001CD3\U00002010-\U00002027\U00002030-\U00002043\U00002045-\U00002051\U00002053-\U0000205E\U0000207D-\U0000207E\U0000208D-\U0000208E\U00002308-\U0000230B\U00002329-\U0000232A\U00002768-\U00002775\U000027C5-\U000027C6\U000027E6-\U000027EF\U00002983-\U00002998\U000029D8-\U000029DB\U000029FC-\U000029FD\U00002CF9-\U00002CFC\U00002CFE-\U00002CFF\U00002D70-\U00002D70\U00002E00-\U00002E2E\U00002E30-\U00002E4F\U00002E52-\U00002E52\U00003001-\U00003003\U00003008-\U00003011\U00003014-\U0000301F\U00003030-
\U00003030\U0000303D-\U0000303D\U000030A0-\U000030A0\U000030FB-\U000030FB\U0000A4FE-\U0000A4FF\U0000A60D-\U0000A60F\U0000A673-\U0000A673\U0000A67E-\U0000A67E\U0000A6F2-\U0000A6F7\U0000A874-\U0000A877\U0000A8CE-\U0000A8CF\U0000A8F8-\U0000A8FA\U0000A8FC-\U0000A8FC\U0000A92E-\U0000A92F\U0000A95F-\U0000A95F\U0000A9C1-\U0000A9CD\U0000A9DE-\U0000A9DF\U0000AA5C-\U0000AA5F\U0000AADE-\U0000AADF\U0000AAF0-\U0000AAF1\U0000ABEB-\U0000ABEB\U0000FD3E-\U0000FD3F\U0000FE10-\U0000FE19\U0000FE30-\U0000FE52\U0000FE54-\U0000FE61\U0000FE63-\U0000FE63\U0000FE68-\U0000FE68\U0000FE6A-\U0000FE6B\U0000FF01-\U0000FF03\U0000FF05-\U0000FF0A\U0000FF0C-\U0000FF0F\U0000FF1A-\U0000FF1B\U0000FF1F-\U0000FF20\U0000FF3B-\U0000FF3D\U0000FF3F-\U0000FF3F\U0000FF5B-\U0000FF5B\U0000FF5D-\U0000FF5D\U0000FF5F-\U0000FF65\U00010100-\U00010102\U0001039F-\U0001039F\U000103D0-\U000103D0\U0001056F-\U0001056F\U00010857-\U00010857\U0001091F-\U0001091F\U0001093F-\U0001093F\U00010A50-\U00010A58\U00010A7F-\U00010A7F\U00010AF0-\U00010AF6\U00010B39-\U00010B3F\U00010B99-\U00010B9C\U00010EAD-\U00010EAD\U00010F55-\U00010F59\U00011047-\U0001104D\U000110BB-\U000110BC\U000110BE-\U000110C1\U00011140-\U00011143\U00011174-\U00011175\U000111C5-\U000111C8\U000111CD-\U000111CD\U000111DB-\U000111DB\U000111DD-\U000111DF\U00011238-\U0001123D\U000112A9-\U000112A9\U0001144B-\U0001144F\U0001145A-\U0001145B\U0001145D-\U0001145D\U000114C6-\U000114C6\U000115C1-\U000115D7\U00011641-\U00011643\U00011660-\U0001166C\U0001173C-\U0001173E\U0001183B-\U0001183B\U00011944-\U00011946\U000119E2-\U000119E2\U00011A3F-\U00011A46\U00011A9A-\U00011A9C\U00011A9E-\U00011AA2\U00011C41-\U00011C45\U00011C70-\U00011C71\U00011EF7-\U00011EF8\U00011FFF-\U00011FFF\U00012470-\U00012474\U00016A6E-\U00016A6F\U00016AF5-\U00016AF5\U00016B37-\U00016B3B\U00016B44-\U00016B44\U00016E97-\U00016E9A\U00016FE2-\U00016FE2\U0001BC9F-\U0001BC9F\U0001DA87-\U0001DA8B\U0001E95E-\U0001E95F]+", + //cjk + L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+", + //digits + 
L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" + }; + const std::vector deepseek_llm_regex = { + L"[\r\n]", + L"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + 
L"\\s?[\u0021-\u002f\u003a-\u007e\uff01-\uff0f\uff1a-\uff5e\u2018-\u201f\u3000-\u3002]+", + L"\\s+$", + L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+", + L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" + }; }; struct llm_tokenizer_wpm { diff --git a/unicode.cpp b/unicode.cpp index 0a2863ea0..5395bf128 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -5,11 +5,14 
@@ #include #include #include +#include #include #include #include #include #include +#include +#include static std::string unicode_cpts_to_utf8(const std::vector & cps) { std::string result; @@ -194,6 +197,207 @@ static std::unordered_map unicode_utf8_to_byte_map() { return map; } +/* Decode a UTF-8 std::string into a std::wstring using std::wstring_convert::from_bytes. NOTE(review): the converter template arguments are not visible in this view; confirm which codecvt facet is used. */ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) +{ + std::wstring_convert> conv; + return conv.from_bytes(s); +} + +/* Encode a std::wstring as a UTF-8 std::string using std::wstring_convert::to_bytes. */ static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) +{ + // code to convert from utf32/utf16 to utf8 + std::wstring_convert, wchar_t> converter; + std::string utf8 = converter.to_bytes(ws); + return utf8; +} + +/* Byte-level encoding step: each word is decoded to code points and re-encoded (unicode_cpts_from_utf8 then unicode_cpt_to_utf8), and every char of the result is mapped through unicode_byte_to_utf8 and appended. Returns one encoded string per input word, in input order. */ static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { + std::vectorbpe_encoded_words; + /* NOTE(review): word is taken by value, so each element is copied once per iteration. */ for (auto word : bpe_words) { + std::string text_utf = ""; + auto utf_word = unicode_cpts_from_utf8(word); + for (size_t i = 0; i < utf_word.size(); ++i) + text_utf += unicode_cpt_to_utf8(utf_word[i]); + + std::string encoded_token = ""; + for (char & c : text_utf) { + encoded_token += unicode_byte_to_utf8(c); + } + bpe_encoded_words.emplace_back(encoded_token); + } + return bpe_encoded_words; +} + +/* Hand-written splitter that walks the text one code point at a time and cuts it into pieces: the apostrophe contractions handled below, then runs that start as letters, digits, other non-whitespace, or whitespace, as classified by unicode_cpt_type. Returns the pieces in order. The pattern it is modelled on is quoted in the comment below. */ static std::vector unicode_custom_preprocess(const std::string & text) { + std::vector bpe_words; + + std::string token = ""; + // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ + bool collecting_numeric = false; + bool collecting_letter = false; + bool collecting_special = false; + bool collecting_whitespace_lookahead = false; + bool collecting = false; + + /* text_utf holds one UTF-8 string per code point of text */ std::vector text_utf; + text_utf.reserve(text.size()); + bpe_words.reserve(text.size()); + + const auto cpts = unicode_cpts_from_utf8(text); + for (size_t i = 0; i < cpts.size(); ++i) + text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); + + for (int i = 0; i < (int)text_utf.size(); i++) { + const std::string & utf_char = text_utf[i]; + bool split_condition = false; + /* despite the name, this counts entries of text_utf (code points) from i to the end */ int bytes_remain = text_utf.size() - 
i; + // lookahead: the next one and two code points, or an empty string past the end + const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; + const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; + + // handling contractions + if (!split_condition && bytes_remain >= 2) { + // 's|'t|'m|'d + if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { + split_condition = true; + } + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char + utf_char_next; + bpe_words.emplace_back(token); + token = ""; + i++; + continue; + } + } + if (!split_condition && bytes_remain >= 3) { + // 're|'ve|'ll + if (utf_char == "\'" && ( + (utf_char_next == "r" && utf_char_next_next == "e") || + (utf_char_next == "v" && utf_char_next_next == "e") || + (utf_char_next == "l" && utf_char_next_next == "l")) + ) { + split_condition = true; + } + if (split_condition) { + // current token + next token can be defined + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char + utf_char_next + utf_char_next_next; + bpe_words.emplace_back(token); // the contraction + token = ""; + i += 2; + continue; + } + } + + if (!split_condition && !collecting) { + if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { + collecting_letter = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + collecting_numeric = true; + collecting = true; + } + else if ( + ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || + (!token.size() && 
utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) + ) { + collecting_special = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { + collecting_whitespace_lookahead = true; + collecting = true; + } + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { + split_condition = true; + } + } + else if (!split_condition && collecting) { + if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { + split_condition = true; + } + else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { + split_condition = true; + } + else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { + split_condition = true; + } + else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + split_condition = true; + } + } + + /* Last code point: always flush. NOTE(review): utf_char is appended to token before the push even when a split was already requested above, so a final code point of a different class ends up in the same piece as the preceding run - confirm this is intended. */ if (utf_char_next == "") { + split_condition = true; // final + token += utf_char; + } + + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); + } + token = utf_char; + collecting = false; + collecting_letter = false; + collecting_numeric = false; + collecting_special = false; + collecting_whitespace_lookahead = false; + } + else { + token += utf_char; + } + } + + return bpe_words; +} + +/* Refines a segmentation of text with one regex. offsets holds the lengths of consecutive segments of text; each segment is scanned with std::wcregex_iterator, and every match, every gap before a match, and any unmatched tail is emitted as its own length, in text order. Returns the refined list of lengths. */ static std::vector unicode_regex_preprocess(const std::wstring & text, const std::vector & offsets, const std::wstring & regex_expr) { + std::wregex expr(regex_expr); + std::vector bpe_offsets; // stores the length of each resulting piece + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate 
size + size_t start = 0; + for (auto offset : offsets) { + std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr); + std::wcregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::wcmatch match = *it; + if (match.position() > start_idx) { + bpe_offsets.emplace_back(match.position() - start_idx); + } + bpe_offsets.emplace_back(match.length()); + start_idx = match.position() + match.length(); + ++it; + } + + if (start_idx < (int64_t) offset) { + bpe_offsets.emplace_back(offset - start_idx); + } + start += offset; + } + + return bpe_offsets; +} + +/* Returns true if std::regex_match accepts the whole of text for at least one expression in regex_exprs, false otherwise. NOTE(review): a std::wregex is constructed for every expression on every call. */ static bool unicode_regex_matched(const std::wstring & text, const std::vector & regex_exprs) { + + for(auto & regex_expr: regex_exprs) { + std::wregex expr(regex_expr); + if(std::regex_match(text, expr)) { + return true; + } + } + + return false; +} + // // interface // @@ -275,6 +479,7 @@ char32_t unicode_tolower(char32_t cp) { auto it = unicode_map_lowercase.find(cp); return it == unicode_map_lowercase.end() ? 
cp : it->second; } + static const std::vector gpt2_regex = { // //punc: \{p} and ascii puncs L"[\U00000021-\U0000002F\U0000003A-\U00000040\\\U0000005B-\U00000060\U0000007B-\U0000007E\U000000A1-\U000000A1\U000000A7-\U000000A7\U000000AB-\U000000AB\U000000B6-\U000000B7\U000000BB-\U000000BB\U000000BF-\U000000BF\U0000037E-\U0000037E\U00000387-\U00000387\U0000055A-\U0000055F\U00000589-\U0000058A\U000005BE-\U000005BE\U000005C0-\U000005C0\U000005C3-\U000005C3\U000005C6-\U000005C6\U000005F3-\U000005F4\U00000609-\U0000060A\U0000060C-\U0000060D\U0000061B-\U0000061B\U0000061E-\U0000061F\U0000066A-\U0000066D\U000006D4-\U000006D4\U00000700-\U0000070D\U000007F7-\U000007F9\U00000830-\U0000083E\U0000085E-\U0000085E\U00000964-\U00000965\U00000970-\U00000970\U000009FD-\U000009FD\U00000A76-\U00000A76\U00000AF0-\U00000AF0\U00000C77-\U00000C77\U00000C84-\U00000C84\U00000DF4-\U00000DF4\U00000E4F-\U00000E4F\U00000E5A-\U00000E5B\U00000F04-\U00000F12\U00000F14-\U00000F14\U00000F3A-\U00000F3D\U00000F85-\U00000F85\U00000FD0-\U00000FD4\U00000FD9-\U00000FDA\U0000104A-\U0000104F\U000010FB-\U000010FB\U00001360-\U00001368\U00001400-\U00001400\U0000166E-\U0000166E\U0000169B-\U0000169C\U000016EB-\U000016ED\U00001735-\U00001736\U000017D4-\U000017D6\U000017D8-\U000017DA\U00001800-\U0000180A\U00001944-\U00001945\U00001A1E-\U00001A1F\U00001AA0-\U00001AA6\U00001AA8-\U00001AAD\U00001B5A-\U00001B60\U00001BFC-\U00001BFF\U00001C3B-\U00001C3F\U00001C7E-\U00001C7F\U00001CC0-\U00001CC7\U00001CD3-\U00001CD3\U00002010-\U00002027\U00002030-\U00002043\U00002045-\U00002051\U00002053-\U0000205E\U0000207D-\U0000207E\U0000208D-\U0000208E\U00002308-\U0000230B\U00002329-\U0000232A\U00002768-\U00002775\U000027C5-\U000027C6\U000027E6-\U000027EF\U00002983-\U00002998\U000029D8-\U000029DB\U000029FC-\U000029FD\U00002CF9-\U00002CFC\U00002CFE-\U00002CFF\U00002D70-\U00002D70\U00002E00-\U00002E2E\U00002E30-\U00002E4F\U00002E52-\U00002E52\U00003001-\U00003003\U00003008-\U00003011\U00003014-\U0000301F\U00003030-\U00003030\U0000303D-\
U0000303D\U000030A0-\U000030A0\U000030FB-\U000030FB\U0000A4FE-\U0000A4FF\U0000A60D-\U0000A60F\U0000A673-\U0000A673\U0000A67E-\U0000A67E\U0000A6F2-\U0000A6F7\U0000A874-\U0000A877\U0000A8CE-\U0000A8CF\U0000A8F8-\U0000A8FA\U0000A8FC-\U0000A8FC\U0000A92E-\U0000A92F\U0000A95F-\U0000A95F\U0000A9C1-\U0000A9CD\U0000A9DE-\U0000A9DF\U0000AA5C-\U0000AA5F\U0000AADE-\U0000AADF\U0000AAF0-\U0000AAF1\U0000ABEB-\U0000ABEB\U0000FD3E-\U0000FD3F\U0000FE10-\U0000FE19\U0000FE30-\U0000FE52\U0000FE54-\U0000FE61\U0000FE63-\U0000FE63\U0000FE68-\U0000FE68\U0000FE6A-\U0000FE6B\U0000FF01-\U0000FF03\U0000FF05-\U0000FF0A\U0000FF0C-\U0000FF0F\U0000FF1A-\U0000FF1B\U0000FF1F-\U0000FF20\U0000FF3B-\U0000FF3D\U0000FF3F-\U0000FF3F\U0000FF5B-\U0000FF5B\U0000FF5D-\U0000FF5D\U0000FF5F-\U0000FF65\U00010100-\U00010102\U0001039F-\U0001039F\U000103D0-\U000103D0\U0001056F-\U0001056F\U00010857-\U00010857\U0001091F-\U0001091F\U0001093F-\U0001093F\U00010A50-\U00010A58\U00010A7F-\U00010A7F\U00010AF0-\U00010AF6\U00010B39-\U00010B3F\U00010B99-\U00010B9C\U00010EAD-\U00010EAD\U00010F55-\U00010F59\U00011047-\U0001104D\U000110BB-\U000110BC\U000110BE-\U000110C1\U00011140-\U00011143\U00011174-\U00011175\U000111C5-\U000111C8\U000111CD-\U000111CD\U000111DB-\U000111DB\U000111DD-\U000111DF\U00011238-\U0001123D\U000112A9-\U000112A9\U0001144B-\U0001144F\U0001145A-\U0001145B\U0001145D-\U0001145D\U000114C6-\U000114C6\U000115C1-\U000115D7\U00011641-\U00011643\U00011660-\U0001166C\U0001173C-\U0001173E\U0001183B-\U0001183B\U00011944-\U00011946\U000119E2-\U000119E2\U00011A3F-\U00011A46\U00011A9A-\U00011A9C\U00011A9E-\U00011AA2\U00011C41-\U00011C45\U00011C70-\U00011C71\U00011EF7-\U00011EF8\U00011FFF-\U00011FFF\U00012470-\U00012474\U00016A6E-\U00016A6F\U00016AF5-\U00016AF5\U00016B37-\U00016B3B\U00016B44-\U00016B44\U00016E97-\U00016E9A\U00016FE2-\U00016FE2\U0001BC9F-\U0001BC9F\U0001DA87-\U0001DA8B\U0001E95E-\U0001E95F]+", @@ -296,27 +501,31 @@ static const std::vector deepseek_coder_regex = { //digits 
L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" }; + +/* Splits UTF-8 text into byte-encoded pieces. The text is converted to a wide string and its single initial segment length is refined by unicode_regex_preprocess once per expression in regex_exprs, in order. Each resulting piece that unicode_regex_matched accepts is kept whole; any other piece is split further by unicode_custom_preprocess. The collected pieces are returned through unicode_byte_encoding_process. */ std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { + std::wstring wtext = unicode_wstring_from_utf8(text); -static const std::vector deepseek_llm_regex = { - L"[\r\n]", - 
L"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", - L"\\s?[\u0021-\u002f\u003a-\u007e\uff01-\uff0f\uff1a-\uff5e\u2018-\u201f\u3000-\u3002]+", - L"\\s+$", - L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+", - L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0
001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" - }; + std::vector bpe_offsets = {wtext.size()}; -std::vector get_gpt2_regex() { - return gpt2_regex; -} - -std::vector get_deepseek_coder_regex() { - return deepseek_coder_regex; -} - -std::vector get_deepseek_llm_regex() { - return deepseek_llm_regex; -} + for(auto & regex_expr : regex_exprs) { + bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, regex_expr); + } + std::vector bpe_words; + bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + /* walk the pieces in order; start is the running position of the current piece inside wtext */ for(size_t & offset : bpe_offsets){ + const auto temp_word = std::wstring(wtext, start, offset); + if(unicode_regex_matched(temp_word, regex_exprs)) { + bpe_words.emplace_back(unicode_wstring_to_utf8(temp_word)); + } else { + auto custom_bpe_words = unicode_custom_preprocess(unicode_wstring_to_utf8(temp_word)); + bpe_words.insert(bpe_words.end(), custom_bpe_words.begin(), custom_bpe_words.end()); + } + + start += offset; + } + return unicode_byte_encoding_process(bpe_words); +} \ No newline at end of file diff --git a/unicode.h b/unicode.h index cf2f8e976..dc74c4013 100644 --- a/unicode.h +++ b/unicode.h @@ -1,10 +1,10 @@ #pragma once +#include #include +#include #include #include -#include -#include #define CODEPOINT_TYPE_UNIDENTIFIED 0 #define CODEPOINT_TYPE_DIGIT 1 @@ -44,4 +44,5 @@ inline std::string to_utf8(const std::wstring & ws) std::wstring_convert, wchar_t> converter; std::string utf8 = converter.to_bytes(ws); return utf8; -} \ No newline at end of file +} +/* regex-driven splitter; the definition is in unicode.cpp */ std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs);