server : Add option to return token pieces in /tokenize endpoint (#9108)
* server : added with_pieces functionality to /tokenize endpoint * server : Add tokenize with pieces tests to server.feature * Handle case if tokenizer splits along utf8 continuation bytes * Add example of token splitting * Remove trailing ws * Fix trailing ws * Maybe fix ci * maybe this fix windows ci? --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
parent
e6b7801bd1
commit
78203641fe
6 changed files with 139 additions and 6 deletions
|
@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
|||
return res;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
|
||||
static bool is_valid_utf8(const std::string & str) {
|
||||
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
|
||||
const unsigned char* end = bytes + str.length();
|
||||
|
||||
while (bytes < end) {
|
||||
if (*bytes <= 0x7F) {
|
||||
// 1-byte sequence (0xxxxxxx)
|
||||
bytes++;
|
||||
} else if ((*bytes & 0xE0) == 0xC0) {
|
||||
// 2-byte sequence (110xxxxx 10xxxxxx)
|
||||
if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 2;
|
||||
} else if ((*bytes & 0xF0) == 0xE0) {
|
||||
// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 3;
|
||||
} else if ((*bytes & 0xF8) == 0xF0) {
|
||||
// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
|
||||
(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 4;
|
||||
} else {
|
||||
// Invalid UTF-8 lead byte
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const json & tokens) {
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue