resolve merge conflicts
This commit is contained in:
commit
66cffa8aff
127 changed files with 8174 additions and 6065 deletions
|
@ -5,7 +5,6 @@
|
|||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
|
@ -31,6 +30,8 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
|
@ -163,6 +164,8 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
|
@ -196,15 +199,31 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// auto enable conversation mode if chat template is available
|
||||
const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
|
||||
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
|
||||
if (has_chat_template) {
|
||||
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
|
||||
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
|
||||
} else {
|
||||
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
|
||||
}
|
||||
}
|
||||
|
||||
// in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
|
||||
if (params.conversation_mode && !has_chat_template) {
|
||||
LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
|
||||
}
|
||||
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation) {
|
||||
if (params.conversation_mode) {
|
||||
if (params.enable_chat_template) {
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
|
||||
} else {
|
||||
|
@ -241,9 +260,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
const bool add_bos = llama_add_bos_token(model);
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
if (!llama_model_has_encoder(model)) {
|
||||
GGML_ASSERT(!llama_add_eos_token(model));
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
}
|
||||
|
||||
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
|
||||
|
@ -251,8 +270,10 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd_inp;
|
||||
|
||||
{
|
||||
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||
auto prompt = (params.conversation_mode && params.enable_chat_template)
|
||||
// format the system prompt in conversation mode (fallback to default if empty)
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
|
||||
// otherwise use the prompt as is
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG_DBG("tokenize the prompt\n");
|
||||
|
@ -269,7 +290,7 @@ int main(int argc, char ** argv) {
|
|||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
} else {
|
||||
LOG_ERR("input is empty\n");
|
||||
|
@ -326,7 +347,7 @@ int main(int argc, char ** argv) {
|
|||
params.n_keep += add_bos; // always keep the BOS token
|
||||
}
|
||||
|
||||
if (params.conversation) {
|
||||
if (params.conversation_mode) {
|
||||
params.interactive_first = true;
|
||||
}
|
||||
|
||||
|
@ -450,7 +471,11 @@ int main(int argc, char ** argv) {
|
|||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s\n", control_message);
|
||||
LOG_INF( "%s", control_message);
|
||||
if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
|
||||
LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
@ -495,7 +520,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = llama_token_bos(model);
|
||||
decoder_start_token_id = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
embd_inp.clear();
|
||||
|
@ -742,7 +767,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_token_is_eog(model, common_sampler_last(smpl))) {
|
||||
if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -762,7 +787,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
if (params.conversation_mode) {
|
||||
const auto id = common_sampler_last(smpl);
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
@ -770,17 +795,17 @@ int main(int argc, char ** argv) {
|
|||
if (n_past > 0 && is_interacting) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.conversation) {
|
||||
if (params.conversation_mode) {
|
||||
LOG("\n> ");
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty() && !params.conversation) {
|
||||
if (!params.input_prefix.empty() && !params.conversation_mode) {
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG("%s", params.input_prefix.c_str());
|
||||
}
|
||||
|
@ -804,7 +829,7 @@ int main(int argc, char ** argv) {
|
|||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty() && !params.conversation) {
|
||||
if (!params.input_suffix.empty() && !params.conversation_mode) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
@ -817,7 +842,7 @@ int main(int argc, char ** argv) {
|
|||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
bool format_chat = params.conversation && params.enable_chat_template;
|
||||
bool format_chat = params.conversation_mode && params.enable_chat_template;
|
||||
std::string user_inp = format_chat
|
||||
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||
: std::move(buffer);
|
||||
|
@ -830,8 +855,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||
if (need_insert_eot && format_chat) {
|
||||
llama_token eot = llama_token_eot(model);
|
||||
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
|
||||
llama_token eot = llama_vocab_eot(vocab);
|
||||
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
|
||||
need_insert_eot = false;
|
||||
}
|
||||
|
||||
|
@ -866,7 +891,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
|
||||
LOG(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue