Adding unicode regex function
This commit is contained in:
parent
a5710a4101
commit
7e308ed212
4 changed files with 16 additions and 41 deletions
File diff suppressed because one or more lines are too long
|
@@ -15,4 +15,3 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
|||
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
||||
extern const std::map<std::string, std::wstring> unicode_regex_to_wregex;
|
||||
extern const std::map<std::string, std::string> unicode_regex_to_regex;
|
24
unicode.cpp
24
unicode.cpp
File diff suppressed because one or more lines are too long
18
unicode.h
18
unicode.h
|
@@ -28,21 +28,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
|||
|
||||
char32_t unicode_tolower(char32_t cp);
|
||||
|
||||
std::vector<std::wstring> get_gpt2_regex();
|
||||
std::vector<std::wstring> get_deepseek_coder_regex();
|
||||
std::vector<std::wstring> get_deepseek_llm_regex();
|
||||
|
||||
/// Decode a UTF-8 encoded byte string into a wide string.
///
/// The element encoding of the result follows the platform's wchar_t width:
/// UTF-16 code units (surrogate pairs for non-BMP characters) when wchar_t is
/// 16 bits, whole code points when it is wider.
///
/// @param s  UTF-8 encoded input.
/// @return   The decoded wide string (empty for empty input).
/// @throws std::range_error if the conversion fails (std::wstring_convert is
///         constructed without an error string, so failures are reported by
///         exception).
inline std::wstring from_utf8(const std::string & s)
{
    // NOTE: std::wstring_convert / <codecvt> are deprecated since C++17; they
    // are kept here to avoid introducing a new dependency.
    if (sizeof(wchar_t) == 2) {
        // A 16-bit wchar_t cannot hold code points above U+FFFF, so the plain
        // codecvt_utf8 facet (UCS-2) cannot decode them; the UTF-16 facet
        // emits surrogate pairs instead.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
        return conv.from_bytes(s);
    }

    // Wider wchar_t: one element per code point.
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
}
|
||||
|
||||
/// Encode a wide string as UTF-8.
///
/// The input is interpreted according to the platform's wchar_t width:
/// as UTF-16 code units (surrogate pairs allowed) when wchar_t is 16 bits,
/// and as whole code points when it is wider.
///
/// @param ws  Wide string to encode.
/// @return    The UTF-8 encoded bytes (empty for empty input).
/// @throws std::range_error if the conversion fails (std::wstring_convert is
///         constructed without an error string, so failures are reported by
///         exception).
inline std::string to_utf8(const std::wstring & ws)
{
    // NOTE: std::wstring_convert / <codecvt> are deprecated since C++17; they
    // are kept here to avoid introducing a new dependency.
    if (sizeof(wchar_t) == 2) {
        // 16-bit wchar_t holds UTF-16, so surrogate pairs must be combined.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
        return converter.to_bytes(ws);
    }

    // Wider wchar_t holds full code points. The UTF-16 facet is not reliable
    // here: its handling of elements above U+FFFF differs between standard
    // library implementations, so use the plain UTF-8 <-> UCS-4 facet.
    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
    return converter.to_bytes(ws);
}
|
||||
bool unicode_wregex_exists(const std::string & regex);
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::wstring> & regex_exprs);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue