added and refactored unicode_regex_split and related functions

This commit is contained in:
Kazim Abrar Mahi 2024-04-01 00:48:49 +06:00 committed by Georgi Gerganov
parent 1c924e4b35
commit 4056dc5b1e
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 264 additions and 103 deletions

View file

@ -1,10 +1,10 @@
#pragma once
#include <codecvt>
#include <cstdint>
#include <locale>
#include <string>
#include <vector>
#include <locale>
#include <codecvt>
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_DIGIT 1
@ -44,4 +44,5 @@ inline std::string to_utf8(const std::wstring & ws)
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
std::string utf8 = converter.to_bytes(ws);
return utf8;
}
}
// Declaration only: the definition is not visible in this header excerpt.
// Takes a byte string `text` and a list of wide-string expressions `regex_exprs`,
// and returns a vector of byte strings.
// NOTE(review): judging by the name, this presumably splits `text` according to
// the patterns in `regex_exprs` and returns the resulting pieces; the encoding of
// `text` and how multiple patterns are combined should be confirmed against the
// implementation.
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::wstring> & regex_exprs);