diff --git a/llama.cpp b/llama.cpp
index 3c992d6f6..59d858e03 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12459,11 +12459,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     return 0;
 }
 
-int32_t llama_chat_apply_template_internal(
-    const std::string & chat_template,
-    const std::vector<const llama_chat_message *> & chat,
-    std::string & dest, bool add_ass);
-
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
     size_t start = 0;
@@ -12479,13 +12474,13 @@ static std::string trim(const std::string & str) {
 
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
-int32_t llama_chat_apply_template_internal(
-    const std::string & chat_template,
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
     const std::vector<const llama_chat_message *> & chat,
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (chat_template.find("<|im_start|>") != std::string::npos) {
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -12493,16 +12488,16 @@ int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (chat_template.find("[INST]") != std::string::npos) {
+    } else if (tmpl.find("[INST]") != std::string::npos) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = chat_template.find("<<SYS>>") != std::string::npos;
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
         // [variant] space before + after response
-        bool space_around_response = chat_template.find("' ' + eos_token") != std::string::npos;
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
         // [variant] add BOS inside history
-        bool add_bos_inside_history = chat_template.find("bos_token + '[INST]") != std::string::npos;
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
         // [variant] trim spaces from the input message
-        bool strip_message = chat_template.find("content.strip()") != std::string::npos;
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
         // construct the prompt
         bool is_inside_turn = true; // skip BOS at the beginning
         ss << "[INST] ";
@@ -12528,7 +12523,7 @@ int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (chat_template.find("<|user|>") != std::string::npos) {
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -12546,24 +12541,24 @@ int32_t llama_chat_apply_template_internal(
 
 LLAMA_API int32_t llama_chat_apply_template(
               const struct llama_model * model,
-                            const char * custom_template,
+                            const char * tmpl,
        const struct llama_chat_message * chat,
                                   size_t n_msg,
                                     bool add_ass,
                                     char * buf,
                                  int32_t length) {
-    std::string current_template(custom_template == nullptr ? "" : custom_template);
-    if (custom_template == nullptr) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
         GGML_ASSERT(model != nullptr);
         // load template from model
         std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
         std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), current_template.size());
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
         if (res < 0) {
             // worst case: there is no information about template, we will use chatml by default
-            current_template = "<|im_start|>"; // see llama_chat_apply_template_internal
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
         } else {
-            current_template = std::string(model_template.data(), model_template.size());
+            curr_tmpl = std::string(model_template.data(), model_template.size());
         }
     }
     // format the chat to string
@@ -12573,7 +12568,7 @@ LLAMA_API int32_t llama_chat_apply_template(
         chat_vec[i] = &chat[i];
     }
     std::string formatted_chat;
-    int32_t res = llama_chat_apply_template_internal(current_template, chat_vec, formatted_chat, add_ass);
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
diff --git a/llama.h b/llama.h
index 3e5357685..a3813d1ea 100644
--- a/llama.h
+++ b/llama.h
@@ -707,7 +707,7 @@ extern "C" {
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
-    /// @param custom_template A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
     /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
@@ -716,7 +716,7 @@ extern "C" {
     /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
     LLAMA_API int32_t llama_chat_apply_template(
               const struct llama_model * model,
-                            const char * custom_template,
+                            const char * tmpl,
        const struct llama_chat_message * chat,
                                   size_t n_msg,
                                     bool add_ass,