llama : tokenizer fixes (#2549)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies
goerch 2023-08-14 18:30:28 +02:00 committed by GitHub
parent 8af3a99ff1
commit ec1b100720
17 changed files with 612 additions and 147 deletions
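
Two C++ helpers change shape in this diff: llama_token_to_str now returns a std::string instead of a const char *, so call sites append .c_str() when handing the result to printf-style functions, and llama_tokenize returns the token vector directly. A minimal sketch of the updated call-site pattern (the dump_prompt wrapper is made up for illustration; the helpers are assumed to be visible through common.h, which defines LLAMA_API_CPP as shown below):

#include "common.h"   // defines LLAMA_API_CPP before including llama.h
#include <cstdio>
#include <string>
#include <vector>

// dump_prompt is a made-up helper; it only shows the new call-site pattern
static void dump_prompt(llama_context * ctx, const std::string & prompt) {
    // llama_tokenize now hands back the token vector directly
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, prompt, /*add_bos=*/true);

    // llama_token_to_str returns std::string, hence the .c_str() for printf
    for (auto id : tokens) {
        printf("%6d -> '%s'\n", id, llama_token_to_str(ctx, id).c_str());
    }
}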

View file

@@ -633,17 +633,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     return "The";
 }
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-    return res;
-}
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();

View file

@@ -2,6 +2,7 @@
 #pragma once
+#define LLAMA_API_CPP // TODO: eliminate me
 #include "llama.h"
 #include <string>
@@ -100,12 +101,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 std::string gpt_random_prompt(std::mt19937 & rng);
-//
-// Vocab utils
-//
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
 //
 // Model utils
 //
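
With the declaration gone from common.h and LLAMA_API_CPP defined right before including llama.h, the C++-flavoured helpers presumably move into llama.h behind that guard. A hedged sketch of what such a guarded section could look like, with signatures inferred from the call sites in this diff rather than copied from the header:

// sketch only, not the actual llama.h -- C++ helpers exposed to translation
// units that opt in by defining LLAMA_API_CPP before the include
#ifdef LLAMA_API_CPP
#include <string>
#include <vector>

// tokenize text, optionally prepending BOS; returns the tokens directly
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);

// return the text piece a single token maps to
std::string llama_token_to_str(struct llama_context * ctx, llama_token token);
#endif // LLAMA_API_CPP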

View file

@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

View file

@@ -191,10 +191,6 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -278,7 +274,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         if (ctx_guidance) {
@@ -286,14 +282,14 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -662,7 +658,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }
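
main.cpp also stops prepending a space to the prompt before tokenizing. The assumption behind that removal (not visible in this excerpt) is that llama_tokenize itself now reproduces the OG llama tokenizer's leading-space behaviour, so a caller simply passes the raw prompt; a small sketch with a hypothetical tokenize_prompt wrapper:

#include "common.h"
#include <string>
#include <vector>

// hypothetical wrapper showing the call site after this commit
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    // pre-commit code inserted a leading space into the prompt to match the
    // OG tokenizer; post-commit the prompt is passed untouched, and the
    // leading-space handling is assumed to live inside the tokenizer now
    return ::llama_tokenize(ctx, prompt, /*add_bos=*/true);
}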

View file

@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "build-info.h"
+#define LLAMA_API_CPP // TODO: eliminate me
 #define LLAMA_API_INTERNAL
 #include "llama.h"

View file

@@ -45,9 +45,8 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
         llama_free(ctx);
@@ -92,7 +91,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -152,7 +151,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);

View file

@@ -62,7 +62,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "\n\n");
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
     }
     fflush(stderr);
@@ -109,7 +109,7 @@ int main(int argc, char ** argv) {
         }
         // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id));
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
         fflush(stdout);
         // push this new token for next evaluation

View file

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -1961,7 +1962,7 @@ void print_matrix(struct ggml_tensor * probs) {
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2188,11 +2189,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     f.read_raw(buf.data(), f.size);
     buf[f.size] = '\0';
     out.resize(buf.size());
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }
     bool verify = false;
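
The rewritten buffer handling above leans on the C-level llama_tokenize signalling a too-small output buffer by returning the negative of the required token count. A small stand-alone sketch of that call-and-retry pattern (the tokenize_with_retry helper is hypothetical, not part of the diff):

#include "llama.h"
#include <string>
#include <vector>

// grow-and-retry wrapper around the C API, mirroring the pattern used in
// tokenize_file above (hypothetical helper, not part of this commit)
static std::vector<llama_token> tokenize_with_retry(llama_context * ctx, const std::string & text) {
    std::vector<llama_token> out(text.size()); // first guess: one token per byte
    int n_tokens = llama_tokenize(ctx, text.c_str(), out.data(), (int) out.size(), false);
    if (n_tokens < 0) {
        // a negative return value is the required size; resize and tokenize again
        out.resize(-n_tokens);
        n_tokens = llama_tokenize(ctx, text.c_str(), out.data(), (int) out.size(), false);
    }
    out.resize(n_tokens);
    return out;
}
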
@@ -2200,17 +2200,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
                 break;
             }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
             if (matches) {
                 in += len;
             } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
             }
         }
     }