llama : fix whitespace escaping in tokenizer (#2724)

This commit is contained in:
goerch 2023-08-22 23:10:42 +02:00 committed by GitHub
parent c63bb1d16a
commit 46ef5b5fcf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 16 additions and 21 deletions

View file

@ -2253,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
}
static std::string llama_escape_whitespace(const std::string& text) {
std::string result;
bool escaping = false;
result += "\xe2\x96\x81";
std::string result = "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') {
if (!escaping) {
result += "\xe2\x96\x81";
escaping = true;
}
}
else {
escaping = false;
result += "\xe2\x96\x81";
} else {
result += text[offs];
}
}