Fix previous commit
This commit is contained in:
parent
dcac74792b
commit
b67c81d1fa
1 changed file with 7 additions and 4 deletions
|
@@ -697,14 +697,17 @@ static std::vector<uint32_t> unicode_regex_prepare(const std::string & regex) {
|
||||||
|
|
||||||
// use std::basic_regex<uint32_t> to split the text codepoints
|
// use std::basic_regex<uint32_t> to split the text codepoints
|
||||||
static std::vector<size_t> unicode_regex_split_stl(const std::vector<uint32_t> & text_cpts, const std::vector<uint32_t> & regex_cpts, const std::vector<size_t> & offsets) {
|
static std::vector<size_t> unicode_regex_split_stl(const std::vector<uint32_t> & text_cpts, const std::vector<uint32_t> & regex_cpts, const std::vector<size_t> & offsets) {
|
||||||
using regex_type = std::basic_regex<uint32_t>;
|
GGML_ASSERT(sizeof(std::codepoint) == sizeof(uint32_t));
|
||||||
using iter_type = std::regex_iterator<const uint32_t *>;
|
using regex_type = std::basic_regex<std::codepoint>;
|
||||||
regex_type regex(regex_cpts.begin(), regex_cpts.end());
|
using iter_type = std::regex_iterator<const std::codepoint *>;
|
||||||
|
|
||||||
|
const std::codepoint * text_data = (const std::codepoint *) text_cpts.data();
|
||||||
|
const std::codepoint * regex_data = (const std::codepoint *) regex_cpts.data();
|
||||||
|
regex_type regex(regex_data, regex_data+regex_cpts.size());
|
||||||
const iter_type end;
|
const iter_type end;
|
||||||
|
|
||||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||||
bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size
|
bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size
|
||||||
const uint32_t * text_data = text_cpts.data();
|
|
||||||
for (auto offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
iter_type it(text_data, text_data + offset, regex);
|
iter_type it(text_data, text_data + offset, regex);
|
||||||
int64_t start_idx = 0;
|
int64_t start_idx = 0;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue