Merge branch 'master' into stablelm-support

Author: Galunid
Date:   2023-10-24 12:52:48 +02:00
Commit: 27d0c11897

43 changed files with 4182 additions and 3056 deletions

@@ -1,8 +1,7 @@
 ---
-name: Issue and enhancement template
-about: Used to report issues and request enhancements for llama.cpp
-title: "[User] Insert summary of your issue or enhancement.."
-labels: ''
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug"]
 assignees: ''
 ---
@@ -46,7 +45,7 @@ $ g++ --version
 # Failure Information (for bugs)
-Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+Please help provide information about the failure / bug.
 # Steps to Reproduce

.github/ISSUE_TEMPLATE/enhancement.md (new file, 28 lines)

@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

.gitignore (1 line changed)

@@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
+*.bat
 *.metallib
 .DS_Store
 .build/

@@ -605,8 +605,8 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
 const sampling = make.obj("sampling", "common/sampling.cpp");
 const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
 const train = make.obj("train", "common/train.cpp");
+const clip = make.obj("clip", "examples/llava/clip.cpp");
 _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
 _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
 _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
 _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
+const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
 if (server.target.isWindows()) {
 server.linkSystemLibrary("ws2_32");
 }

@@ -880,13 +880,13 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 }
 if (params.ignore_eos) {
-params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
 }
 {
 LOG("warming up the model with an empty run\n");
-std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
 llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
 llama_kv_cache_tokens_rm(lctx, -1, -1);
 llama_reset_timings(lctx);
@@ -941,7 +941,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 }
 std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-const llama_token bos_id = llama_token_bos(ctx);
+const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
 std::string piece;
 std::string result;
@@ -1186,7 +1186,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
 fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
+const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
 const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
 fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");

@@ -147,7 +147,7 @@ llama_token llama_sampling_sample(
 // apply penalties
 if (!prev.empty()) {
-const float nl_logit = logits[llama_token_nl(ctx_main)];
+const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
 llama_sample_repetition_penalties(ctx_main, &cur_p,
 prev.data() + prev.size() - penalty_last_n,
@@ -155,7 +155,7 @@ llama_token llama_sampling_sample(
 if (!penalize_nl) {
 for (size_t idx = 0; idx < cur_p.size; idx++) {
-if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
+if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
 cur_p.data[idx].logit = nl_logit;
 break;
 }

@@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
 int64_t used_samples = 0;
 ggml_set_f32(target_probs, 0.0f);
-llama_token bos = llama_token_bos(lctx);
-llama_token eos = llama_token_eos(lctx);
+llama_token bos = llama_token_bos(llama_get_model(lctx));
+llama_token eos = llama_token_eos(llama_get_model(lctx));
 // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
 for (int k=0; k<n_batch; ++k) {
 // printf("%s: batch %d\n", __func__, k);
@@ -924,7 +924,7 @@ size_t tokenize_file(
 for (llama_token token=0; token < n_vocab; ++token) {
 max_token_text_size = std::max(
 max_token_text_size,
-strlen(llama_token_get_text(lctx, token)));
+strlen(llama_token_get_text(llama_get_model(lctx), token)));
 }
 // upper bound of context byte length.

@@ -110,7 +110,7 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
 hparams = json.load(f)
 print("hello print: ",hparams["architectures"][0])
-if hparams["architectures"][0] != "BaichuanForCausalLM":
+if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
 print("Model architecture not supported: " + hparams["architectures"][0])
 sys.exit()

@@ -118,15 +118,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))

@@ -123,15 +123,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))

@@ -136,9 +136,11 @@ for i in range(vocab_size):
         tokens.append(f"[PAD{i}]")
         toktypes.append(gguf.TokenType.USER_DEFINED)
     elif reverse_vocab[i] in added_vocab:
-        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
         tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.USER_DEFINED)
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
     else:
         tokens.append(reverse_vocab[i])
         toktypes.append(gguf.TokenType.NORMAL)

@@ -139,15 +139,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))

@@ -111,17 +111,25 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)
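
The same added-token classification recurs in each of the conversion scripts above. Below is a condensed sketch of that rule pulled out into a standalone helper for reference; it is not code from the commit, `dir_model` and `vocab_size` are assumed inputs, and the Hugging Face `transformers` and `gguf-py` packages are assumed to be installed.

```python
# Sketch of the shared vocab rule: pad holes in the id space, mark added special
# tokens as CONTROL, other added tokens as USER_DEFINED, and the rest as NORMAL.
import gguf
from transformers import AutoTokenizer

def classify_vocab(dir_model: str, vocab_size: int):
    tokenizer = AutoTokenizer.from_pretrained(dir_model)
    added_vocab = tokenizer.get_added_vocab()
    reverse_vocab = {tid: tok for tok, tid in tokenizer.vocab.items()}

    tokens, toktypes = [], []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # id not present in the tokenizer vocab: emit a placeholder token
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            # token added on top of the base vocab; special -> CONTROL
            tokens.append(reverse_vocab[i])
            special = tokenizer.added_tokens_decoder[i].special
            toktypes.append(gguf.TokenType.CONTROL if special else gguf.TokenType.USER_DEFINED)
        else:
            # regular vocabulary entry
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)
    return tokens, toktypes
```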

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
 //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 // is it an end of stream? -> mark the stream as finished
-if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
 i_batch[i] = -1;
 LOG_TEE("\n");
 if (n_parallel > 1) {

@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
 }
 // Function matching type llama_beam_search_callback_fn_t.

@@ -246,14 +246,14 @@ int main(int argc, char ** argv) {
 if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
 inp_sfx.erase(inp_sfx.begin());
 }
-inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
 if (add_bos) {
-inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
 }
-inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
 embd_inp = inp_pfx;
 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-embd_inp.push_back(llama_token_middle(ctx));
+embd_inp.push_back(llama_token_middle(model));
 LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
 LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -261,7 +261,7 @@ int main(int argc, char ** argv) {
 // Should not run without any tokens
 if (embd_inp.empty()) {
-embd_inp.push_back(llama_token_bos(ctx));
+embd_inp.push_back(llama_token_bos(model));
 LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
 }
@@ -577,10 +577,10 @@ int main(int argc, char ** argv) {
 if ((int) embd_inp.size() <= n_consumed) {
 // deal with eot token in infill mode
-if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){
+if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
 if(is_interacting && !params.interactive_first) {
 // print an eot token
-printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
 }
 fflush(stdout);
 printf("\n");
@@ -627,14 +627,14 @@ int main(int argc, char ** argv) {
 if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
 inp_sfx.erase(inp_sfx.begin());
 }
-inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
+inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
 if (add_bos) {
-inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
+inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
 }
-inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
 embd_inp = inp_pfx;
 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-embd_inp.push_back(llama_token_middle(ctx));
+embd_inp.push_back(llama_token_middle(model));
 embd.clear();
 embd_guidance.clear();
 n_remain = params.n_predict;
@@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
 is_interacting = false;
 }
 // deal with end of text token in interactive mode
-else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
+else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
 LOG("found EOS token\n");
 if (params.interactive) {
@@ -661,7 +661,7 @@ int main(int argc, char ** argv) {
 if (params.input_prefix_bos) {
 LOG("adding input prefix BOS token\n");
-embd_inp.push_back(llama_token_bos(ctx));
+embd_inp.push_back(llama_token_bos(model));
 }
 std::string buffer;
@@ -724,7 +724,7 @@ int main(int argc, char ** argv) {
 }
 // end of text token
-if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
+if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
 break;
 }
@@ -736,7 +736,7 @@ int main(int argc, char ** argv) {
 }
 }
 if (!params.interactive && n_remain <= 0) {
-printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
 fflush(stdout);
 }

@@ -933,7 +933,7 @@ struct sql_printer : public printer {
 };
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
+std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
 int n_processed = 0;
 llama_set_n_threads(ctx, n_threads, n_threads);
@@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-llama_token token = llama_token_bos(ctx);
+llama_token token = llama_token_bos(llama_get_model(ctx));
 llama_set_n_threads(ctx, n_threads, n_threads);

@@ -1,7 +1,7 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
 target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h

@@ -610,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
 int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
 for (int i = 0; i < 3; ++i) {
-new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
-new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
+new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
 }
 if (verbosity >= 2) {

@@ -137,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
 inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
 int id = sample_id(ctx_llama, params);
 static std::string ret;
-if (id == llama_token_eos(ctx_llama)) {
+if (id == llama_token_eos(llama_get_model(ctx_llama))) {
 ret = "</s>";
 } else {
 ret = llama_token_to_piece(ctx_llama, id);

@@ -248,7 +248,7 @@ int main(int argc, char ** argv) {
 // Should not run without any tokens
 if (embd_inp.empty()) {
-embd_inp.push_back(llama_token_bos(ctx));
+embd_inp.push_back(llama_token_bos(model));
 LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
 }
@@ -693,7 +693,7 @@ int main(int argc, char ** argv) {
 }
 // deal with end of text token in interactive mode
-if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
+if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
 LOG("found EOS token\n");
 if (params.interactive) {
@@ -720,7 +720,7 @@ int main(int argc, char ** argv) {
 if (params.input_prefix_bos) {
 LOG("adding input prefix BOS token\n");
-embd_inp.push_back(llama_token_bos(ctx));
+embd_inp.push_back(llama_token_bos(model));
 }
 std::string buffer;
@@ -804,7 +804,7 @@ int main(int argc, char ** argv) {
 }
 // end of text token
-if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
+if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
 LOG_TEE(" [end of text]\n");
 break;
 }

@@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
 // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 if (client.n_decoded > 2 &&
-(id == llama_token_eos(ctx) ||
+(id == llama_token_eos(model) ||
 (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
 client.response.find("User:") != std::string::npos ||
 client.response.find('\n') != std::string::npos)) {

@@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 // add BOS token for the first batch of each chunk
 if (add_bos && j == 0) {
-tokens[batch_start] = llama_token_bos(ctx);
+tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
 }
 const auto batch_logits = llama_get_logits(ctx);
@@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 // add BOS token for the first batch of each chunk
 if (add_bos && j == 0) {
-tokens[batch_start] = llama_token_bos(ctx);
+tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
 }
 if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {

@@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
 SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
 TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()

@@ -24,6 +24,10 @@ Command line options:
 - `--port`: Set the port to listen. Default: `8080`.
 - `--path`: path from which to serve static files (default examples/server/public)
 - `--embedding`: Enable embedding extraction, Default: disabled.
+- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
+- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.

 ## Build

@@ -158,6 +162,8 @@ node index.js
 `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

+`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

 *Result JSON:*

 Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

@@ -188,6 +194,12 @@ node index.js
 `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

+`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+
+`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+
+`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

 - **POST** `/tokenize`: Tokenize a given text.

 *Options:*

@@ -218,8 +230,32 @@ node index.js
 It also accepts all the options of `/completion` except `stream` and `prompt`.

+- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.

 ## More examples

+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.

 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).
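
As a point of reference for the options documented above, here is a minimal client-side sketch; it is not part of this commit. The field names and endpoints follow the README text, while the host/port, the image file, and the use of the Python `requests` package are assumptions.

```python
# Minimal sketch of a client using the new server options described above.
import base64
import requests

SERVER = "http://127.0.0.1:8080"  # assumed default --host/--port

# Set a shared system prompt for all slots at runtime
# (it can also be loaded at startup with -spf FNAME / --system-prompt-file FNAME).
requests.post(f"{SERVER}/completion", json={
    "system_prompt": {
        "prompt": "A dialog between a User and an Assistant.\nUser:",
        "anti_prompt": "User:",
        "assistant_name": "Assistant:",
    },
})

# A multimodal completion: [img-12] in the prompt refers to id 12 in image_data.
with open("photo.jpg", "rb") as f:  # hypothetical image file
    img_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(f"{SERVER}/completion", json={
    "prompt": "USER:[img-12]Describe the image in detail.\nASSISTANT:",
    "image_data": [{"data": img_b64, "id": 12}],
    "n_predict": 128,
    "cache_prompt": True,  # keep the evaluated prompt around for the next request
    "slot_id": -1,         # -1 = any idle slot; the response reports which slot was used
})
result = resp.json()
print(result["content"])
print("served by slot", result.get("slot_id"))
```

Re-sending the returned `slot_id` together with `cache_prompt: true` on the next request lets the server skip re-evaluating the unchanged part of the prompt, which is what the `chat.mjs` and `api_like_OAI.py` changes below do.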

@@ -8,6 +8,7 @@ import json
 app = Flask(__name__)
+slot_id = -1
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
 if(is_present(body, "stop")): postData["stop"] += body["stop"]
 postData["n_keep"] = -1
 postData["stream"] = stream
+postData["cache_prompt"] = True
+postData["slot_id"] = slot_id
 return postData
 def make_resData(data, chat=False, promptToken=[]):
@@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
 }
 ]
 }
+slot_id = data["slot_id"]
 if (chat):
 if (start):
 resData["choices"][0]["delta"] = {

@@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
 (_, index) => args[index - 1] === "--grammar-json-schema"
 );
+
+const no_cached_prompt = args.find(
+    (_, index) => args[index - 1] === "--no-cache-prompt"
+) ?? "false";
+
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
 // Example usage: function,arguments
@@ -30,6 +35,9 @@ if (grammarFile) {
 grammar = readFileSync(grammarFile, 'utf-8')
 }
+
+// for cached prompt
+let slot_id = -1;
 const API_URL = 'http://127.0.0.1:8080'
 const chat = [
@@ -76,6 +84,8 @@ async function chat_completion(question) {
 top_p: 0.9,
 n_keep: n_keep,
 n_predict: 256,
+cache_prompt: no_cached_prompt === "false",
+slot_id: slot_id,
 stop: ["\n### Human:"], // stop completion after generating this
 grammar,
 stream: true,
@@ -92,6 +102,7 @@ async function chat_completion(question) {
 const t = Buffer.from(chunk).toString('utf8')
 if (t.startsWith('data: ')) {
 const message = JSON.parse(t.substring(6))
+slot_id = message.slot_id
 answer += message.content
 process.stdout.write(message.content)
 if (message.stop) {

File diff suppressed because it is too large.

@@ -125,6 +125,7 @@
 background-color: #222;
 color: #ddd;
 }
+
 code {
 font-family: monospace;
 padding: 0.1em 0.3em;
@@ -141,7 +142,8 @@
 display: inline;
 }
-header, footer {
+header,
+footer {
 text-align: center;
 }
@@ -163,6 +165,7 @@
 0% {
 background-position: 0%;
 }
+
 100% {
 background-position: 100%;
 }
@@ -181,6 +184,7 @@
 --loading-color-1: #22222200;
 --loading-color-2: #222222ff;
 }
+
 .popover-content {
 background-color: black;
 }
@@ -194,6 +198,8 @@
 import { llama } from '/completion.js';
 import { SchemaConverter } from '/json-schema-to-grammar.mjs';
+let selected_image = false;
+var slot_id = -1;
 const session = signal({
 prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@@ -203,6 +209,7 @@
 type: "chat", // "chat" | "completion"
 char: "Llama",
 user: "User",
+image_selected: ''
 })
 const params = signal({
@@ -220,7 +227,9 @@
 mirostat_tau: 5, // target entropy
 mirostat_eta: 0.1, // learning rate
 grammar: '',
-n_probs: 0, // no completion_probabilities
+n_probs: 0, // no completion_probabilities,
+image_data: [],
+cache_prompt: true
 })
 /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@@ -270,6 +279,7 @@
 // saved templates were successfuly imported.
 console.log('Processing saved templates and updating default template')
+params.value = { ...params.value, image_data: [] };
 //console.log(importedTemplates);
 savedUserTemplates.value = importedTemplates;
@@ -294,7 +304,9 @@
 function userTemplateApply(t) {
 session.value = t.data.session;
+session.value = { ...session.value, image_selected: '' };
 params.value = t.data.params;
+params.value = { ...params.value, image_data: [] };
 }
 function userTemplateResetToDefaultAndApply() {
@@ -385,20 +397,25 @@
 throw new Error("already running");
 }
 controller.value = new AbortController();
-for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
+for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
 const data = chunk.data;
 if (data.stop) {
 while (
 currentMessages.length > 0 &&
 currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
 ) {
 currentMessages.pop();
 }
 transcriptUpdate([...history, [char, currentMessages]])
 console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
 } else {
 currentMessages.push(data);
+slot_id = data.slot_id;
+if (selected_image && !data.multimodal) {
+    alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+    return;
+}
 transcriptUpdate([...history, [char, currentMessages]])
 }
@@ -419,7 +436,7 @@
 transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
-const prompt = template(session.value.template, {
+let prompt = template(session.value.template, {
 message: msg,
 history: session.value.transcript.flatMap(
 ([name, data]) =>
@@ -434,9 +451,12 @@
 )
 ).join("\n"),
 });
+if (selected_image) {
+    prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+}
 await runLlama(prompt, {
 ...params.value,
+slot_id: slot_id,
 stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
 }, "{{char}}");
 }
@@ -446,10 +466,11 @@
 console.log('already running...');
 return;
 }
-const {prompt} = session.value;
+const { prompt } = session.value;
 transcriptUpdate([...session.value.transcript, ["", prompt]]);
 await runLlama(prompt, {
 ...params.value,
+slot_id: slot_id,
 stop: [],
 }, "");
 }
@@ -467,6 +488,27 @@
 transcriptUpdate([]);
 }
+const uploadImage = (e) => {
+    e.preventDefault();
+    document.getElementById("fileInput").click();
+    document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+            const reader = new FileReader();
+            reader.onload = function () {
+                const image_data = reader.result;
+                session.value = { ...session.value, image_selected: image_data };
+                params.value = {
+                    ...params.value, image_data: [
+                        { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+                }
+            };
+            selected_image = true;
+            reader.readAsDataURL(selectedFile);
+        }
+    });
+}
 function MessageInput() {
 const message = useSignal("")
@@ -497,6 +539,7 @@
 </div>
 <div class="right">
 <button type="submit" disabled=${generating.value}>Send</button>
+<button onclick=${uploadImage}>Upload Image</button>
 <button onclick=${stop} disabled=${!generating.value}>Stop</button>
 <button onclick=${reset}>Reset</button>
 </div>
@@ -540,7 +583,7 @@
 data;
 message = html`<${Markdownish} text=${template(text)} />`
 }
-if(user) {
+if (user) {
 return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
 } else {
 return html`<p key=${index}>${message}</p>`
@@ -549,6 +592,7 @@
 return html`
 <section id="chat" ref=${container}>
+<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
 ${messages.flatMap(chatLine)}
 </section>`;
 };
@@ -567,7 +611,7 @@
 const converter = new SchemaConverter(
 grammarJsonSchemaPropOrder.value
 .split(',')
-.reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
 )
 converter.visit(schema, '')
 params.value = {
@@ -579,7 +623,7 @@
 }
 }
-const FloatField = ({label, max, min, name, step, value}) => {
+const FloatField = ({ label, max, min, name, step, value }) => {
 return html`
 <div>
 <label for="${name}">${label}</label>
@@ -589,7 +633,7 @@
 `
 };
-const IntField = ({label, max, min, name, value}) => {
+const IntField = ({ label, max, min, name, value }) => {
 return html`
 <div>
 <label for="${name}">${label}</label>
@@ -672,7 +716,7 @@
 ${GrammarControl()}
 </fieldset>
 `
 );
 const CompletionConfigForm = () => (
 html`
@@ -694,20 +738,20 @@
 ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
 <fieldset class="two">
-${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
-${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
-${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
-${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
-${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
-${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
 </fieldset>
 <details>
 <summary>More options</summary>
 <fieldset class="two">
-${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
-${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
-${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
-${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
+${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
 </fieldset>
 <hr />
 <fieldset class="three">
@@ -716,11 +760,11 @@
 <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
 <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
 </div>
-${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
-${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
 </fieldset>
 <fieldset>
-${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
 </fieldset>
 </details>
 </form>
@@ -759,20 +803,20 @@
 const popoverChildren = html`
 <div class="prob-set">
 ${probs.map((p, index) => {
 return html`
 <div
 key=${index}
 title=${`prob: ${p.prob}`}
 style=${{
 padding: '0.3em',
 backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
 }}
 >
 <span>${p.tok_str}: </span>
 <span>${Math.floor(p.prob * 100)}%</span>
 </div>
 `
 })}
 </div>
 `
@@ -851,9 +895,9 @@
 ref=${popoverRef}
 class="popover-content"
 style=${{
 top: position.value.top,
 left: position.value.left,
 }}
 >
 ${props.popoverChildren}
 </div>
@@ -952,8 +996,11 @@
 </head>
 <body>
-<div id="container"></div>
+<div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+</div>
 <div id="portal"></div>
 </body>
 </html>

File diff suppressed because it is too large.

@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
 const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 // is it an end of stream?
-if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
 LOG_TEE("\n");
 break;

@@ -163,7 +163,7 @@ int main(int argc, char ** argv) {
 printf("%s", token_str.c_str());
 fflush(stdout);
-if (id == llama_token_eos(ctx_tgt)) {
+if (id == llama_token_eos(model_tgt)) {
 has_eos = true;
 }

@@ -62,6 +62,7 @@ struct ggml_metal_context {
 GGML_METAL_DECL_KERNEL(mul);
 GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
 GGML_METAL_DECL_KERNEL(scale);
+GGML_METAL_DECL_KERNEL(scale_4);
 GGML_METAL_DECL_KERNEL(silu);
 GGML_METAL_DECL_KERNEL(relu);
 GGML_METAL_DECL_KERNEL(gelu);
@@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 GGML_METAL_ADD_KERNEL(mul);
 GGML_METAL_ADD_KERNEL(mul_row);
 GGML_METAL_ADD_KERNEL(scale);
+GGML_METAL_ADD_KERNEL(scale_4);
 GGML_METAL_ADD_KERNEL(silu);
 GGML_METAL_ADD_KERNEL(relu);
 GGML_METAL_ADD_KERNEL(gelu);
@@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 GGML_METAL_DEL_KERNEL(mul);
 GGML_METAL_DEL_KERNEL(mul_row);
 GGML_METAL_DEL_KERNEL(scale);
+GGML_METAL_DEL_KERNEL(scale_4);
 GGML_METAL_DEL_KERNEL(silu);
 GGML_METAL_DEL_KERNEL(relu);
 GGML_METAL_DEL_KERNEL(gelu);
@@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
 const float scale = *(const float *) src1->data;
-[encoder setComputePipelineState:ctx->pipeline_scale];
+int64_t n = ggml_nelements(dst);
+if (n % 4 == 0) {
+    n /= 4;
+    [encoder setComputePipelineState:ctx->pipeline_scale_4];
+} else {
+    [encoder setComputePipelineState:ctx->pipeline_scale];
+}
 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-const int64_t n = ggml_nelements(dst);
-GGML_ASSERT(n % 4 == 0);
-[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
 } break;
 case GGML_OP_UNARY:
 switch (ggml_get_unary_op(gf->nodes[i])) {

@ -125,9 +125,17 @@ kernel void kernel_mul_row(
} }
kernel void kernel_scale( kernel void kernel_scale(
device const float * src0,
device float * dst,
constant float & scale,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale;
}
kernel void kernel_scale_4(
device const float4 * src0, device const float4 * src0,
device float4 * dst, device float4 * dst,
constant float & scale, constant float & scale,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale; dst[tpig] = src0[tpig] * scale;
} }
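For orientation, the host-side change above picks between these two kernels at dispatch time: the new kernel_scale_4 processes float4 vectors whenever the element count is divisible by 4, while the scalar kernel_scale now covers the remaining shapes that the old GGML_ASSERT(n % 4 == 0) rejected. A minimal C++ sketch of that selection logic (pick_scale_kernel and the enum are hypothetical helpers, not part of ggml):

    // Sketch only: mirrors the n % 4 branch shown in ggml_metal_graph_compute above.
    #include <cstdint>

    enum class scale_kernel { scalar, vec4 };

    struct scale_dispatch {
        scale_kernel kernel;    // which Metal pipeline would be bound
        int64_t      n_groups;  // threadgroups dispatched, one thread each
    };

    static scale_dispatch pick_scale_kernel(int64_t n_elements) {
        if (n_elements % 4 == 0) {
            // kernel_scale_4 reads and writes float4, so it needs n/4 invocations
            return { scale_kernel::vec4, n_elements / 4 };
        }
        // otherwise fall back to one invocation per scalar element
        return { scale_kernel::scalar, n_elements };
    }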

@ -8006,7 +8006,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
} }
} }
const llama_token eos = llama_token_eos(ctx); const llama_token eos = llama_token_eos(&ctx->model);
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded; std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
std::vector<llama_grammar_candidate> candidates_grammar; std::vector<llama_grammar_candidate> candidates_grammar;
@ -8216,7 +8216,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
const int64_t t_start_sample_us = ggml_time_us(); const int64_t t_start_sample_us = ggml_time_us();
if (token == llama_token_eos(ctx)) { if (token == llama_token_eos(&ctx->model)) {
for (const auto & stack : grammar->stacks) { for (const auto & stack : grammar->stacks) {
if (stack.empty()) { if (stack.empty()) {
return; return;
@ -9425,7 +9425,7 @@ struct llama_context * llama_new_context_with_model(
// build worst-case graph // build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
int n_past = cparams.n_ctx - n_tokens; int n_past = cparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@ -10186,43 +10186,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data(); return ctx->embedding.data();
} }
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
return ctx->model.vocab.id_to_token[token].text.c_str(); return model->vocab.id_to_token[token].text.c_str();
} }
float llama_token_get_score(const struct llama_context * ctx, llama_token token) { float llama_token_get_score(const struct llama_model * model, llama_token token) {
return ctx->model.vocab.id_to_token[token].score; return model->vocab.id_to_token[token].score;
} }
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
return ctx->model.vocab.id_to_token[token].type; return model->vocab.id_to_token[token].type;
} }
llama_token llama_token_bos(const struct llama_context * ctx) { llama_token llama_token_bos(const struct llama_model * model) {
return ctx->model.vocab.special_bos_id; return model->vocab.special_bos_id;
} }
llama_token llama_token_eos(const struct llama_context * ctx) { llama_token llama_token_eos(const struct llama_model * model) {
return ctx->model.vocab.special_eos_id; return model->vocab.special_eos_id;
} }
llama_token llama_token_nl(const struct llama_context * ctx) { llama_token llama_token_nl(const struct llama_model * model) {
return ctx->model.vocab.linefeed_id; return model->vocab.linefeed_id;
}
llama_token llama_token_prefix(const struct llama_context * ctx) {
return ctx->model.vocab.special_prefix_id;
} }
llama_token llama_token_middle(const struct llama_context * ctx) { llama_token llama_token_prefix(const struct llama_model * model) {
return ctx->model.vocab.special_middle_id; return model->vocab.special_prefix_id;
} }
llama_token llama_token_suffix(const struct llama_context * ctx) { llama_token llama_token_middle(const struct llama_model * model) {
return ctx->model.vocab.special_suffix_id; return model->vocab.special_middle_id;
} }
llama_token llama_token_eot(const struct llama_context * ctx) { llama_token llama_token_suffix(const struct llama_model * model) {
return ctx->model.vocab.special_eot_id; return model->vocab.special_suffix_id;
}
llama_token llama_token_eot(const struct llama_model * model) {
return model->vocab.special_eot_id;
} }
int llama_tokenize( int llama_tokenize(

llama.h

@ -494,21 +494,22 @@ extern "C" {
// Vocab // Vocab
// //
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
// codellama infill tokens // codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
// //
// Tokenization // Tokenization
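In short, every vocab and special-token accessor now takes a llama_model pointer rather than a llama_context, so callers that only hold a context reach the model through it. A hedged sketch of the migration for an end-of-stream check, assuming model, ctx and candidates_p are already set up as in the examples earlier in this diff:

    #include "llama.h"

    // before: if (new_token_id == llama_token_eos(ctx))   { ... }
    // after:  if (new_token_id == llama_token_eos(model)) { ... }
    const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
    if (new_token_id == llama_token_eos(model)) {
        // end of stream
    }
    // inside llama.cpp itself, where only the context is at hand,
    // the model is reached through it: llama_token_eos(&ctx->model)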

4 binary files not shown.

@ -28,11 +28,16 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-1-llama.cpp) llama_build_executable(test-tokenizer-1-llama.cpp)
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
llama_build_executable(test-tokenizer-1-bpe.cpp) llama_build_executable(test-tokenizer-1-bpe.cpp)
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf) llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
llama_build_and_test_executable(test-grammar-parser.cpp) llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-grad0.cpp) # SLOW

@ -91,9 +91,19 @@ int main(int argc, char **argv) {
} }
} }
} }
// TODO: why doesn't this work for the full range of Unicodes? // Restrict to assigned unicode planes
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) { for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp); std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens); std::string check = llama_detokenize_bpe(ctx, tokens);
@ -103,7 +113,6 @@ int main(int argc, char **argv) {
return 4; return 4;
} }
} }
llama_free_model(model); llama_free_model(model);
llama_free(ctx); llama_free(ctx);
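For clarity, the reworked test round-trips two codepoint ranges instead of one, leaving out the unassigned planes 4-13 (0x40000-0xDFFFF) that the old TODO stumbled over. A small sketch of the equivalent range predicate (is_tested_codepoint is a hypothetical helper, not part of the test):

    #include <cstdint>

    // Sketch only: encodes the two loops from test-tokenizer-1-bpe.cpp above.
    static bool is_tested_codepoint(uint32_t cp) {
        if (cp >= 0x00010000 && cp < 0x00040000) return true;  // planes 1-3
        if (cp >= 0x000e0000 && cp < 0x0010ffff) return true;  // planes 14-16
        return false;
    }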