llama : distinguish pieces from decoded text + fix detokenization

commit 9668aa115c (parent 5d0ffb69f5)
15 changed files with 93 additions and 68 deletions

@@ -733,12 +733,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -746,3 +746,24 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
 
     return std::string(result.data(), result.size());
 }
+
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    const llama_token bos_id = llama_token_bos(ctx);
+
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
+        }
+
+        result += piece;
+    }
+
+    return result;
+}
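
Note: taken together, the two helpers added above give common callers a per-token piece accessor and a whole-sequence decoder. A minimal caller-side sketch, not part of this commit (it assumes a loaded llama_context * ctx):

    // tokenize, inspect the raw pieces, then decode back to text
    std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
    for (llama_token t : toks) {
        // pieces of SPM word-start tokens carry a leading space, e.g. " Hello"
        printf("%d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
    }
    // llama_detokenize drops the artificial leading space of the first
    // non-BOS piece, so the decoded text has no spurious " " prefix
    const std::string text = llama_detokenize(ctx, toks);
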
@@ -121,6 +121,11 @@ std::vector<llama_token> llama_tokenize(
         const std::string & text,
         bool add_bos);
 
-std::string llama_token_to_str(
+std::string llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token);
+
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);

@@ -35,7 +35,7 @@ struct ostream_beam_view {
 std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
     os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
     for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
+        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
     }
     return os << ')';
 }
@@ -156,7 +156,7 @@ int main(int argc, char ** argv)
 
     for( auto id : tokens_list )
     {
-        std::cout << llama_token_to_str(ctx, id);
+        std::cout << llama_token_to_piece(ctx, id);
     }
     std::cout << std::flush;
 
@@ -175,7 +175,7 @@ int main(int argc, char ** argv)
 
     std::cout << "\n\n";
     for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_str(ctx,token_id);
+        std::cout << llama_token_to_piece(ctx,token_id);
     }
     std::cout << std::endl;
 
@@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
     if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
     }
     eval_id(mymodel, id);
     return ret.c_str();

@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

@@ -280,7 +280,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
@@ -288,14 +288,14 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
+                fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -451,7 +451,7 @@ int main(int argc, char ** argv) {
             //printf("\n---\n");
             //printf("resetting: '");
             //for (int i = 0; i < (int) embd.size(); i++) {
-            //    printf("%s", llama_token_to_str(ctx, embd[i]));
+            //    printf("%s", llama_token_to_piece(ctx, embd[i]));
             //}
             //printf("'\n");
             //printf("\n---\n");
@@ -504,7 +504,7 @@ int main(int argc, char ** argv) {
                 input_size = embd_guidance.size();
                 //fprintf(stderr, "\n---------------------\n");
                 //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                //    fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                //    fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
                 //}
                 //fprintf(stderr, "\n---------------------\n");
             } else {
@@ -663,7 +663,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id).c_str());
+                printf("%s", llama_token_to_piece(ctx, id).c_str());
             }
             fflush(stdout);
         }
@@ -679,7 +679,7 @@ int main(int argc, char ** argv) {
         if (params.antiprompt.size()) {
             std::string last_output;
             for (auto id : last_n_tokens) {
-                last_output += llama_token_to_str(ctx, id);
+                last_output += llama_token_to_piece(ctx, id);
             }
 
             is_antiprompt = false;

@@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
         }
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
         auto next_token = llama_sample_token(ctx, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx, next_token);
+        auto next_token_str = llama_token_to_piece(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
 
         printf("%s", next_token_str.c_str());
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         }
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
         auto next_token = llama_sample_token(ctx2, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        auto next_token_str = llama_token_to_piece(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
 
         printf("%s", next_token_str.c_str());

@@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_str(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
     }
     return ret;
 }
@@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -566,7 +566,7 @@ struct llama_server_context
 
         if (!embd.empty() && embd.back() == llama_token_eos(ctx))
         {
-            // stopping_word = llama_token_to_str(ctx, embd.back());
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
             has_next_token = false;
             stopped_eos = true;
             LOG_VERBOSE("eos token found", {});
@@ -613,7 +613,7 @@ struct llama_server_context
     {
         const completion_token_output token_with_probs = nextToken();
 
-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;
 
         if (params.n_probs > 0)
@@ -1248,7 +1248,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
 
 struct token_translator {
     llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
+    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
     std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
 };
 
@@ -1358,7 +1358,7 @@ int main(int argc, char **argv)
 
             while (llama.has_next_token) {
                 const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
 
                 stop_pos = llama.findStoppingStrings(llama.generated_text,
                     token_text.size(), STOP_FULL);
@@ -1389,7 +1389,7 @@ int main(int argc, char **argv)
                 if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                     continue;
                 }
-                const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
+                const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
 
                 size_t pos = std::min(sent_count, llama.generated_text.size());
 
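
Note on the check in tokens_to_output_formatted_string above: a single-byte piece whose high bit is set cannot be a complete UTF-8 character, because every byte of a multibyte sequence (lead or continuation) has the top bit set. A standalone illustration of the same test, not part of this commit:

    #include <cstdio>

    // true when a lone byte is a fragment of a multibyte UTF-8 character
    static bool is_partial_utf8_byte(unsigned char c) {
        return (c & 0x80) == 0x80;
    }

    int main() {
        printf("%d\n", is_partial_utf8_byte('H'));  // 0: plain ASCII, complete
        printf("%d\n", is_partial_utf8_byte(0xE2)); // 1: lead byte of a 3-byte sequence
        printf("%d\n", is_partial_utf8_byte(0x82)); // 1: continuation byte
    }
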
@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "\n\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
         }
 
         // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
         fflush(stdout);
 
         // push this new token for next evaluation

@@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token).c_str());
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     const char * in  = buf.data();
     const char * end = buf.data() + buf.size();
     for (int i = 0; i < (int) out.size(); ++i) {
-        std::string s = llama_token_to_str(lctx, out[i]);
+        std::string s = llama_token_to_piece(lctx, out[i]);
         int len = s.length();
         if (in >= end) {
             printf("%s: unexpected end of original text.\n", __func__);

llama.cpp (34 changes)

@@ -796,12 +796,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -3374,6 +3374,11 @@ private:
 static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;
 
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True)  returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
     if (bos && vocab.special_bos_id != -1) {
         output.push_back(vocab.special_bos_id);
     }
@@ -3382,11 +3387,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         return output;
     }
 
-    raw_text = " " + raw_text;
-
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
+
                 llm_tokenizer_spm tokenizer(vocab);
                 llama_escape_whitespace(raw_text);
                 tokenizer.tokenize(raw_text, output);
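
Note: moving the leading-space insertion into the SPM branch keeps llama_tokenize_internal in line with the original SentencePiece tokenizer: the prepended space becomes the '▁' marker of the first piece. A worked example of the intended behavior, with token ids taken from the updated tokenizer test further below:

    encode("Hello", add_bos=True) -> [1, 15043]    ("Hello" -> " Hello" -> "▁Hello")
    id_to_piece(15043)            -> "▁Hello"
    decode([15043])               -> "Hello"       (leading space of the first piece is dropped)
    decode([29871, 15043])        -> " Hello"      (29871 is the lone "▁" piece)
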
@@ -4080,15 +4086,15 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string text = llama_token_to_text(ctx, id);
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (text.empty() || text[0] == 0) {
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4292,10 +4298,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string text = llama_token_to_text(ctx, token);
+    const std::string piece = llama_token_to_str(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -6089,12 +6095,12 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
 
-// does not write null-terminator to str
-int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;

llama.h (10 changes)

@@ -381,15 +381,17 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
 
-    // Token Id -> String. Uses the vocabulary in the provided context
-    // Does not write null terminator to the buffer
-    LLAMA_API int llama_token_to_str(
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // Use code is responsible to remove the leading whitespace of the first non-BOS token.
+    LLAMA_API int llama_token_to_piece(
             const struct llama_context * ctx,
             llama_token token,
             char * buf,
             int length);
 
-    LLAMA_API int llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
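
Note: the renamed C API keeps the two-call convention the common wrapper above relies on: when the buffer is too small, the call returns the negated length the piece needs. A minimal sketch of the protocol against the header as declared here, not part of this commit:

    std::string token_to_piece(const llama_context * ctx, llama_token token) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
        if (n < 0) {
            buf.resize(-n); // -n is the exact size the piece requires
            n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
        }
        // no null terminator is written, so construct with an explicit length
        return std::string(buf.data(), n);
    }
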
@@ -6,14 +6,6 @@
 #include <map>
 #include <vector>
 
-static std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string result;
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-    }
-    return result;
-}
-
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
         { "" , { }, },

@@ -13,7 +13,7 @@ dir_tokenizer = args.dir_tokenizer
 tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
 
 tests = [
-        ""
+        "",
         " ",
         "  ",
         "   ",
@@ -49,3 +49,10 @@ for text in tests:
     print('\nwithout bos:')
     print(tokenizer.encode(text, add_bos=False))
     print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
+
+print("'" + tokenizer.id_to_piece(15043) + "'") # '▁Hello'
+print("'" + tokenizer.id_to_piece(29871) + "'") # '▁'
+print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
+print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
+print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
+print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'

@@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string result;
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-    }
-    return result;
-}
-
 int main(int argc, char **argv) {
     if (argc < 2) {
         fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@@ -72,13 +64,13 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(ctx);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str(ctx, i);
+        std::string forward = llama_token_to_piece(ctx, i);
         std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
         if (tokens.size() == 1) {
             if (i != tokens[0]) {
-                std::string backward = llama_token_to_str(ctx, tokens[0]);
+                std::string backward = llama_token_to_piece(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
+                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
                 return 2;
             }
         }