unicode : clean-up

This commit is contained in:
Georgi Gerganov 2024-04-28 18:01:59 +03:00
parent d63cc9068b
commit e972e6cbf8
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 53 additions and 62 deletions

View file

@ -1,4 +1,4 @@
#include "unicode-data.h"
#include "unicode-data.h"
#include <cstdint>
#include <map>
@ -1649,7 +1649,3 @@ const std::map<char32_t, char32_t> unicode_map_lowercase = {
{0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E},
{0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943},
};
const std::set<std::string> unicode_regex_with_custom_preprocessor = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
};