server : Add option to return token pieces in /tokenize endpoint (#9108)

* server : added with_pieces functionality to /tokenize endpoint

* server : Add tokenize with pieces tests to server.feature

* Handle case if tokenizer splits along utf8 continuation bytes

* Add example of token splitting

* Remove trailing ws

* Fix trailing ws

* Maybe fix ci

* maybe this fix windows ci?

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
Mathijs Henquet 2024-09-12 22:30:11 +02:00 committed by GitHub
parent e6b7801bd1
commit 78203641fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 139 additions and 6 deletions

View file

@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
json tokens_response = json::array();
if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false);
tokens = ctx_server.tokenize(body.at("content"), add_special);
const bool with_pieces = json_value(body, "with_pieces", false);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
if (with_pieces) {
for (const auto& token : tokens) {
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
json piece_json;
// Check if the piece is valid UTF-8
if (is_valid_utf8(piece)) {
piece_json = piece;
} else {
// If not valid UTF-8, store as array of byte values
piece_json = json::array();
for (unsigned char c : piece) {
piece_json.push_back(static_cast<int>(c));
}
}
tokens_response.push_back({
{"id", token},
{"piece", piece_json}
});
}
} else {
tokens_response = tokens;
}
}
const json data = format_tokenizer_response(tokens);
const json data = format_tokenizer_response(tokens_response);
res_ok(res, data);
};