parent 0e74a7222e
commit e6b1a5003e
10 changed files with 119 additions and 115 deletions
@@ -564,7 +564,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + 1);
     const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
     assert(n >= 0);
     res.resize(n);
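A rough, self-contained sketch of the allocate-then-shrink pattern this helper uses, and of why the scratch buffer now reserves one extra slot: with whitespace escaping the tokenizer can plausibly emit one more token than there are input characters, so text.size() + add_bos alone may no longer be a safe upper bound. fake_tokenize() below is a stand-in written for this note, not the llama.cpp API:

    #include <cassert>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Stand-in tokenizer: one id per input byte, plus an optional BOS id and one
    // marker id for an injected leading-whitespace escape. Illustration only.
    static int fake_tokenize(const std::string & text, bool add_bos, int * out, int n_max) {
        int n = 0;
        if (add_bos) {
            if (n >= n_max) return -1;
            out[n++] = 1;                 // pretend BOS id
        }
        if (n >= n_max) return -1;
        out[n++] = 259;                   // pretend id for the escaped leading space
        for (unsigned char c : text) {
            if (n >= n_max) return -1;
            out[n++] = 3 + c;             // pretend byte-fallback ids
        }
        return n;
    }

    int main() {
        const std::string text = "hi";
        const bool add_bos = true;
        std::vector<int> res(text.size() + (int) add_bos + 1);   // upper bound, as above
        const int n = fake_tokenize(text, add_bos, res.data(), (int) res.size());
        assert(n >= 0);
        res.resize(n);                    // shrink to the ids actually produced
        for (int id : res) printf("%d ", id);
        printf("\n");
        return 0;
    }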
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }
@@ -196,10 +196,6 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -283,7 +279,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
@@ -291,14 +287,14 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -636,7 +632,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }
@@ -91,7 +91,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
@@ -102,7 +102,7 @@ int main(int argc, char ** argv)
 
     for( auto id : tokens_list )
     {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+        printf( "%s" , llama_token_to_str( ctx , id ).c_str() );
     }
 
     fflush(stdout);
@@ -162,7 +162,7 @@ int main(int argc, char ** argv)
         }
 
         // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ).c_str() );
         fflush( stdout );
 
         // Push this new token for next evaluation :
@@ -1959,7 +1959,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2198,17 +2198,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     const char * in = buf.data();
     const char * end = buf.data() + buf.size();
     for (int i = 0; i < (int) out.size(); ++i) {
-        const char * s = llama_token_to_str(lctx, out[i]);
-        int len = strlen(s);
+        std::string s = llama_token_to_str(lctx, out[i]);
+        int len = s.length();
         if (in >= end) {
             printf("%s: unexpected end of original text.\n", __func__);
             break;
         }
-        const bool matches = (strncmp(in, s, len) == 0);
+        const bool matches = (strncmp(in, s.c_str(), len) == 0);
         if (matches) {
             in += len;
         } else {
-            printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+            printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
         }
     }
 }
llama.cpp (105 changed lines)
@@ -242,13 +242,6 @@ struct llama_kv_cache {
     }
 };
 
-struct llama_trie {
-    std::unordered_map<std::string, llama_trie> map;
-};
-
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs);
-size_t llama_trie_find(const struct llama_trie& trie, const std::string& text, size_t offs);
-
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;
@@ -260,7 +253,6 @@ struct llama_vocab {
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_score> id_to_token;
-    struct llama_trie trie;
 };
 
 struct llama_model {
@@ -524,13 +516,12 @@ struct llama_file_loader {
             float score = 0.0f;
             file.read_raw(&score, sizeof(score));
 
+            assert(vocab.token_to_id.find(word) == vocab.token_to_id.end());
             vocab.token_to_id[word] = i;
 
             auto & tok_score = vocab.id_to_token[i];
             tok_score.tok = word;
             tok_score.score = score;
-
-            llama_trie_insert(vocab.trie, word, 0);
         }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
@@ -1804,26 +1795,37 @@ struct llama_sp_bigram {
     size_t size;
 };
 
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) == trie.map.end()) {
-            trie.map[key] = llama_trie();
-        }
-        llama_trie_insert(trie.map.at(key), text, offs + char_len);
-    }
-}
+static std::string llama_escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
 
-size_t llama_trie_find(const struct llama_trie& trie, const std::string & text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) != trie.map.end()) {
-            return char_len + llama_trie_find(trie.map.at(key), text, offs + char_len);
-        }
-    }
-    return 0;
-}
+static std::string llama_unescape_whitespace(const std::string& word) {
+    if (word.length() >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + word.substr(3);
+    }
+    return word;
+}
 
 // original implementation:
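The two helpers added above can be checked in isolation. The snippet below copies them verbatim from this hunk; the main() and its expected strings are only an illustration, where "\xe2\x96\x81" is the UTF-8 form of U+2581, the marker that stands in for a space:

    #include <cassert>
    #include <string>

    // Copied from the hunk above.
    static std::string llama_escape_whitespace(const std::string& text) {
        std::string result;
        bool escaping = false;
        result += char(0xe2);
        result += char(0x96);
        result += char(0x81);
        for (size_t offs = 0; offs < text.length(); ++offs) {
            if (text[offs] == ' ') {
                if (!escaping) {
                    result += char(0xe2);
                    result += char(0x96);
                    result += char(0x81);
                    escaping = true;
                }
            } else {
                escaping = false;
                result += text[offs];
            }
        }
        return result;
    }

    // Copied from the hunk above.
    static std::string llama_unescape_whitespace(const std::string& word) {
        if (word.length() >= 3 &&
            word[0] == char(0xe2) &&
            word[1] == char(0x96) &&
            word[2] == char(0x81)) {
            return std::string(" ") + word.substr(3);
        }
        return word;
    }

    int main() {
        // Every space becomes the U+2581 marker and the text also gets a leading marker.
        assert(llama_escape_whitespace("Hello world") == "\xe2\x96\x81Hello\xe2\x96\x81world");
        // A single marked piece maps back to a leading space.
        assert(llama_unescape_whitespace("\xe2\x96\x81world") == " world");
        return 0;
    }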
@@ -1832,13 +1834,12 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars / token?
+        // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
             size_t len = utf8_len(text[offs]);
-            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
@@ -1908,7 +1909,7 @@ private:
 
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
-            for (int j = 0; j < (int) symbol.n; ++j) {
+            for (int j = 0; j < (int)symbol.n; ++j) {
                 llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                 output.push_back(token_id);
             }
@@ -1954,11 +1955,11 @@ private:
     std::map<std::string, std::pair<int, int> > rev_merge;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.empty()) {
+    if (raw_text.empty()) {
         return output;
     }
 
@@ -1966,6 +1967,13 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
         output.push_back(llama_token_bos());
     }
 
+    std::string text;
+    if (escape) {
+        text = llama_escape_whitespace(raw_text);
+    } else {
+        text = raw_text;
+    }
+
     tokenizer.tokenize(text, output);
     return output;
 }
@@ -3620,7 +3628,7 @@ int llama_tokenize_with_model(
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos, true);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3643,6 +3651,27 @@ int llama_tokenize(
     return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }
 
+int llama_tokenize_bpe(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
+
+    if (n_max_tokens < (int) res.size()) {
+        fprintf(stderr, "%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
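Both entry points share the return convention visible above: the token count on success, and the negative of the required buffer size when the caller's buffer is too small. A standalone sketch of the grow-and-retry pattern a caller might build on that convention (fake_api_tokenize() is a stand-in written for this note, not the real API):

    #include <cstdio>
    #include <vector>

    // Stand-in with the same convention as above: returns -(needed size) if n_max is too small.
    static int fake_api_tokenize(const char * /*text*/, int * tokens, int n_max) {
        const int needed = 5;                       // pretend the text needs 5 ids
        if (n_max < needed) return -needed;
        for (int i = 0; i < needed; i++) tokens[i] = 100 + i;
        return needed;
    }

    int main() {
        std::vector<int> buf(2);                    // deliberately too small
        int n = fake_api_tokenize("hello", buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize(-n);                         // grow to the reported size and retry
            n = fake_api_tokenize("hello", buf.data(), (int) buf.size());
        }
        printf("got %d tokens\n", n);
        return 0;
    }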
@@ -3696,18 +3725,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+std::string llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
     if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return model->vocab.id_to_token[token].tok.c_str();
+    return llama_unescape_whitespace(model->vocab.id_to_token[token].tok);
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     return llama_token_to_str_with_model(&ctx->model, token);
 }
 
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+    if (token >= llama_n_vocab_from_model(&ctx->model)) {
+        return nullptr;
+    }
+
+    return ctx->model.vocab.id_to_token[token].tok;
+}
+
 llama_token llama_token_bos() {
     return 1;
 }
llama.h (16 changed lines)
@@ -11,6 +11,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
+#include <string>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -278,6 +279,13 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
 
+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
     LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,
@@ -319,11 +327,15 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API std::string llama_token_to_str(
             const struct llama_context * ctx,
             llama_token token);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API std::string llama_token_to_str_bpe(
+            const struct llama_context * ctx,
+            llama_token token);
+
+    LLAMA_API std::string llama_token_to_str_with_model(
             const struct llama_model * model,
             llama_token token);
 
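One caller-side consequence of moving llama_token_to_str() from const char * to std::string is temporary lifetime: calling .c_str() on the returned value is safe within the same full expression, but storing that pointer dangles once the temporary is destroyed. A small self-contained illustration, with token_piece() standing in for the new std::string-returning call (not the real API):

    #include <cstdio>
    #include <string>

    // Stand-in for a std::string-returning token lookup.
    static std::string token_piece(int /*token*/) { return " world"; }

    int main() {
        // Fine: the temporary lives until the end of the full expression.
        printf("%s\n", token_piece(42).c_str());

        // Fine: keep the string alive before taking a pointer into it.
        const std::string piece = token_piece(42);
        const char * p = piece.c_str();
        printf("%s\n", p);

        // Dangling (do not do this): const char * q = token_piece(42).c_str();
        return 0;
    }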
@@ -5,44 +5,10 @@
 #include <map>
 #include <vector>
 
-static std::string escape_whitespace(const std::string& text) {
-    std::string result;
-    bool escaping = false;
-    result += char(0xe2);
-    result += char(0x96);
-    result += char(0x81);
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
-            if (!escaping) {
-                result += char(0xe2);
-                result += char(0x96);
-                result += char(0x81);
-                escaping = true;
-            }
-        }
-        else {
-            escaping = false;
-            result += text[offs];
-        }
-    }
-    return result;
-}
-
-static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
-    const char* word = llama_token_to_str(ctx, token);
-    if (strlen(word) >= 3 &&
-        word[0] == char(0xe2) &&
-        word[1] == char(0x96) &&
-        word[2] == char(0x81)) {
-        return std::string(" ") + (word + 3);
-    }
-    return word;
-}
-
 static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += unescape_whitespace(ctx, tokens[i]);
+        result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
 }
@@ -50,6 +16,9 @@ static std::string unescape_whitespace(llama_context* ctx, const llama_token* to
 static const std::map<std::string, std::vector<llama_token>> & k_tests()
 {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        {" ", {1, 259,},},
+        { "\t", { 1, 29871, 12, }, },
+        { "\n", { 1, 29871, 13, }, },
         { "Hello world", { 1, 15043, 3186, }, },
         { " Hello world", { 1, 29871, 15043, 3186, }, },
         { "Hello World", { 1, 15043, 2787, }, },
@@ -58,7 +27,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
         {" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
         {"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
         {"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
-    };
+        {"How are you?", { 1, 1128, 526, 366, 29973, }, },
+    };
     return _k_tests;
 };
 
@@ -109,8 +79,8 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
+        std::vector<llama_token> res(test_kv.first.size() + 2);
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
         res.resize(n);
@@ -15,7 +15,7 @@ static std::string escape_whitespace(const std::string& text) {
     result += char(0x96);
     result += char(0x81);
     for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+        if (text[offs] == ' ') {
             if (!escaping) {
                 result += char(0xe2);
                 result += char(0x96);
@@ -31,21 +31,10 @@ static std::string escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
-    const char* word = llama_token_to_str(ctx, token);
-    if (strlen(word) >= 3 &&
-        word[0] == char(0xe2) &&
-        word[1] == char(0x96) &&
-        word[2] == char(0x81)) {
-        return std::string(" ") + (word + 3);
-    }
-    return word;
-}
-
 static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += unescape_whitespace(ctx, tokens[i]);
+        result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
 }
@@ -97,22 +86,22 @@ int main(int argc, char **argv) {
     }
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str(ctx, i);
+        std::string forward = llama_token_to_str_bpe(ctx, i);
         std::vector<llama_token> tokens(forward.length());
-        int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
+        int n = llama_tokenize_bpe(ctx, forward.c_str(), tokens.data(), forward.length(), false);
         if (n == 1) {
             if (i != tokens[0]) {
-                std::string backward = unescape_whitespace(ctx, tokens[0]);
+                std::string backward = llama_token_to_str(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
             }
         } else {
             if (i <= 258) {
                 fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             }
         }
     }
@@ -121,8 +110,8 @@ int main(int argc, char **argv) {
     for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
-        std::vector<llama_token> tokens(strlen(str.c_str()));
-        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
+        std::vector<llama_token> tokens(str.length() + 1);
+        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length() + 1, false);
         if (n == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n",
                 __func__, str.c_str(), tokens[0]);