From b67c81d1fab3608099e229b916da4dfd2c81d57e Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 20:25:45 +0200 Subject: [PATCH] Fix previous commit --- src/unicode.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index b7c0fc549..2c98676a8 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -697,14 +697,17 @@ static std::vector unicode_regex_prepare(const std::string & regex) { // use std::basic_regex to split the text codepoints static std::vector unicode_regex_split_stl(const std::vector & text_cpts, const std::vector & regex_cpts, const std::vector & offsets) { - using regex_type = std::basic_regex; - using iter_type = std::regex_iterator; - regex_type regex(regex_cpts.begin(), regex_cpts.end()); + GGML_ASSERT(sizeof(std::codepoint) == sizeof(uint32_t)); + using regex_type = std::basic_regex; + using iter_type = std::regex_iterator; + + const std::codepoint * text_data = (const std::codepoint *) text_cpts.data(); + const std::codepoint * regex_data = (const std::codepoint *) regex_cpts.data(); + regex_type regex(regex_data, regex_data+regex_cpts.size()); const iter_type end; std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // reserve memory for the approximate size - const uint32_t * text_data = text_cpts.data(); for (auto offset : offsets) { iter_type it(text_data, text_data + offset, regex); int64_t start_idx = 0;