Merge remote-tracking branch 'origin/master' into jinja
This commit is contained in:
commit
cb72cf1fc3
215 changed files with 23423 additions and 18704 deletions
|
@ -2,6 +2,9 @@
|
|||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
|
@ -18,6 +21,7 @@
|
|||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
|
@ -62,11 +66,29 @@
|
|||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
#define PATH_MAX MAX_PATH
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
|
||||
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
||||
struct curl_slist_ptr {
|
||||
struct curl_slist * ptr = nullptr;
|
||||
~curl_slist_ptr() {
|
||||
if (ptr) {
|
||||
curl_slist_free_all(ptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
@ -843,7 +865,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
|
@ -851,26 +873,28 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
return iparams;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
if (params.reranking) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
||||
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
||||
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
||||
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
@ -878,40 +902,40 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
|
||||
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
|
||||
params.ctx_shift = false;
|
||||
}
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
|
||||
|
||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
||||
int err = llama_control_vector_apply(lctx,
|
||||
cvec.data.data(),
|
||||
cvec.data.size(),
|
||||
cvec.n_embd,
|
||||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
int err = llama_apply_adapter_cvec(
|
||||
lctx,
|
||||
cvec.data.data(),
|
||||
cvec.data.size(),
|
||||
cvec.n_embd,
|
||||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
@ -919,30 +943,31 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
common_lora_adapter_container loaded_la;
|
||||
loaded_la.path = la.path;
|
||||
loaded_la.scale = la.scale;
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||
}
|
||||
if (!params.lora_init_without_apply) {
|
||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
|
||||
la.ptr = lora.get();
|
||||
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
if (!params.lora_init_without_apply) {
|
||||
common_set_adapter_lora(lctx, params.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos) {
|
||||
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
|
||||
if (llama_token_is_eog(model, i)) {
|
||||
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
||||
if (llama_vocab_is_eog(vocab, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias.push_back({i, -INFINITY});
|
||||
}
|
||||
|
@ -963,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_token_bos(model);
|
||||
llama_token eos = llama_token_eos(model);
|
||||
llama_token bos = llama_vocab_bos(vocab);
|
||||
llama_token eos = llama_vocab_eos(vocab);
|
||||
|
||||
// some models (e.g. T5) don't have a BOS token
|
||||
if (bos != LLAMA_TOKEN_NULL) {
|
||||
tmp.push_back(bos);
|
||||
|
@ -979,7 +1005,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
|
@ -993,17 +1019,17 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
llama_perf_context_reset(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
iparams.context = lctx;
|
||||
iparams.model.reset(model);
|
||||
iparams.context.reset(lctx);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
llama_lora_adapter_clear(ctx);
|
||||
for (auto & la : lora_adapters) {
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
llama_clear_adapter_lora(ctx);
|
||||
for (auto & la : lora) {
|
||||
if (la.scale != 0.0f) {
|
||||
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
llama_set_adapter_lora(ctx, la.ptr, la.scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1120,7 +1146,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
|
|||
|
||||
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
||||
// Initialize libcurl
|
||||
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
|
@ -1134,11 +1161,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer ";
|
||||
auth_header += hf_token.c_str();
|
||||
struct curl_slist *http_headers = NULL;
|
||||
http_headers = curl_slist_append(http_headers, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
@ -1148,8 +1173,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
struct stat model_file_info;
|
||||
auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
|
@ -1409,7 +1433,7 @@ struct llama_model * common_load_model_from_url(
|
|||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(local_path.c_str(), params);
|
||||
return llama_model_load_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
|
@ -1435,6 +1459,80 @@ struct llama_model * common_load_model_from_hf(
|
|||
return common_load_model_from_url(model_url, local_path, hf_token, params);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
||||
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
||||
*
|
||||
* Return pair of <repo, file> (with "repo" already having tag removed)
|
||||
*
|
||||
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
||||
*/
|
||||
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
json model_info;
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
model_info = json::parse(res_str);
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
if (!model_info.contains("ggufFile")) {
|
||||
throw std::runtime_error("error: model does not have ggufFile");
|
||||
}
|
||||
json & gguf_file = model_info.at("ggufFile");
|
||||
if (!gguf_file.contains("rfilename")) {
|
||||
throw std::runtime_error("error: ggufFile does not have rfilename");
|
||||
}
|
||||
|
||||
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
|
@ -1456,6 +1554,11 @@ struct llama_model * common_load_model_from_hf(
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return std::make_pair("", "");
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
|
@ -1554,21 +1657,23 @@ std::vector<llama_token> common_tokenize(
|
|||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
return common_tokenize(vocab, text, add_special, parse_special);
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_model * model,
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
@ -1576,13 +1681,19 @@ std::vector<llama_token> common_tokenize(
|
|||
return result;
|
||||
}
|
||||
|
||||
static std::string _common_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
||||
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
return common_token_to_piece(vocab, token, special);
|
||||
}
|
||||
|
||||
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
||||
const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||||
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||||
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
}
|
||||
else {
|
||||
|
@ -1592,17 +1703,19 @@ static std::string _common_token_to_piece(const struct llama_model * model, llam
|
|||
return piece;
|
||||
}
|
||||
|
||||
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
return _common_token_to_piece(llama_get_model(ctx), token, special);
|
||||
std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
return common_detokenize(vocab, tokens, special);
|
||||
}
|
||||
|
||||
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
||||
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
|
||||
std::string text;
|
||||
text.resize(std::max(text.capacity(), tokens.size()));
|
||||
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
if (n_chars < 0) {
|
||||
text.resize(-n_chars);
|
||||
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
|
||||
}
|
||||
|
||||
|
@ -1630,9 +1743,8 @@ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
|
@ -1643,16 +1755,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
int alloc_size = 0;
|
||||
bool fallback = false; // indicate if we must fallback to default chatml
|
||||
std::vector<llama_chat_message> chat;
|
||||
for (auto & msg : msgs) {
|
||||
for (const auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, /* name */ nullptr) : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// error: chat template is not supported
|
||||
if (res < 0) {
|
||||
|
@ -1660,18 +1772,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
// if the custom "tmpl" is not supported, we throw an error
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
||||
throw std::runtime_error("this custom template is not supported");
|
||||
} else {
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
fallback = true;
|
||||
}
|
||||
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
fallback = true;
|
||||
}
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(
|
||||
fallback ? nullptr : model,
|
||||
fallback ? "chatml" : ptr_tmpl,
|
||||
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
}
|
||||
|
@ -1724,11 +1835,13 @@ static std::string _llama_model_meta_val_str(const struct llama_model * model, c
|
|||
|
||||
llama_chat_templates llama_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
|
||||
{
|
||||
auto bos_token = _common_token_to_piece(model, llama_token_bos(model), true);
|
||||
auto eos_token = _common_token_to_piece(model, llama_token_eos(model), true);
|
||||
auto vocab = llama_model_get_vocab(model);
|
||||
auto bos_token = common_token_to_piece(vocab, llama_token_bos(vocab), true);
|
||||
auto eos_token = common_token_to_piece(vocab, llama_token_eos(vocab), true);
|
||||
std::string default_template_src = chat_template_override;
|
||||
std::string tool_use_template_src = chat_template_override;
|
||||
if (chat_template_override.empty()) {
|
||||
// TODO:
|
||||
default_template_src = _llama_model_meta_val_str(model, "tokenizer.chat_template");
|
||||
tool_use_template_src = _llama_model_meta_val_str(model, "tokenizer.chat_template.tool_use");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue