Merge branch 'ggerganov:master' into master

commit dfef2c4c37

7 changed files with 252 additions and 37 deletions
@@ -1802,6 +1802,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_threads_http = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+   add_opt(common_arg(
+       {"--cache-reuse"}, "N",
+       string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+       [](common_params & params, int value) {
+           params.n_cache_reuse = value;
+       }
+   ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
    add_opt(common_arg(
        {"--metrics"},
        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

@@ -283,7 +283,8 @@ struct common_params {
    int32_t port = 8080; // server listens on this network port
    int32_t timeout_read = 600; // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
-   int     n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+   int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+   int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT

@@ -147,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |

@@ -523,9 +524,30 @@ Takes a prefix and a suffix and returns the predicted completion as stream.

 - `input_prefix`: Set the prefix of the code to infill.
 - `input_suffix`: Set the suffix of the code to infill.
+- `prompt`: Added after the `FIM_MID` token
+- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874

 It also accepts all the options of `/completion`.

+If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
+
+```txt
+<FIM_REP>myproject
+<FIM_SEP>{chunk 0 filename}
+{chunk 0 text}
+<FIM_SEP>{chunk 1 filename}
+{chunk 1 text}
+...
+<FIM_SEP>filename
+<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
+
+If the tokens are missing, then the extra context is simply prefixed at the start:
+
+```txt
+[extra_context]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
+
 ### **GET** `/props`: Get server global properties.

 This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`

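For orientation only (this is not part of the diff), a minimal client-side sketch of a `POST /infill` body that exercises the new `extra_context` field might look like the following. The field names come from the README hunk above; the file name and snippet contents are made-up examples, and the sketch assumes nlohmann/json, the same JSON library the server code uses.

```cpp
// Build an illustrative JSON body for POST /infill with one extra_context chunk.
// Sending the request (URL, HTTP client) is left to the caller.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json chunk;
    chunk["filename"] = "utils.py";                                     // optional per the diff
    chunk["text"]     = "def is_ascii(ch: str) -> bool:\n"
                        "    return ord(ch) < 128\n";                   // required, must be a string

    json body;
    body["input_prefix"]  = "def remove_non_ascii(s: str) -> str:\n    result = \"\"\n";
    body["input_suffix"]  = "\n    return result\n";
    body["prompt"]        = "";                                         // appended after FIM_MID
    body["extra_context"] = json::array({chunk});

    std::cout << body.dump(2) << std::endl;                             // use as the request body
    return 0;
}
```

Each chunk is then emitted either as a `<FIM_SEP>`-delimited file section or as a plain-text snippet, depending on whether the model defines the repo-level FIM tokens, as shown in the two layouts above.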
@@ -139,6 +139,7 @@ struct slot_params {

    json input_prefix;
    json input_suffix;
+   json extra_context;
};

struct server_slot {

@@ -170,6 +171,7 @@ struct server_slot {

    // when a task is submitted, we first tokenize the prompt and store it here
    std::vector<llama_token> prompt_tokens;
+   std::vector<llama_token> extra_tokens;

    std::string generated_text;
    std::vector<llama_token> cache_tokens;

@@ -800,7 +802,7 @@ struct server_context {
            int slot_prompt_len = slot_prompt.size();

            // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-           int lcp_len = common_part(slot_prompt, prompt);
+           int lcp_len = longest_common_prefix(slot_prompt, prompt);

            // fraction of the common substring length compared to the current slot's prompt length
            similarity = static_cast<float>(lcp_len) / slot_prompt_len;

@@ -908,8 +910,26 @@ struct server_context {
        }

        // infill
        slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
        slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
+       slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
+
+       SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
+       for (const auto & chunk : slot.params.extra_context) {
+           // { "text": string, "filename": string }
+           if (!chunk.contains("text") || !chunk["text"].is_string()) {
+               send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
+               return false;
+           }
+
+           // filename is optional
+           if (chunk.contains("filename") && !chunk["filename"].is_string()) {
+               send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
+               return false;
+           }
+
+           SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
+       }

        // get prompt
        if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {

@@ -1938,13 +1958,66 @@ struct server_context {
            } break;
        case SERVER_TASK_CMPL_TYPE_INFILL:
            {
+               // use FIM repo-level pattern:
+               // ref: https://arxiv.org/pdf/2409.12186
+               //
+               // [FIM_REP]myproject
+               // [FIM_SEP]filename0
+               // extra chunk 0
+               // [FIM_SEP]filename1
+               // extra chunk 1
+               // ...
+               // [FIM_SEP]filename
+               // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
+               //
                auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
                auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);

-               // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-               const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
+               slot.extra_tokens.clear();
+               if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+                   static const auto k_fim_repo = tokenize("myproject\n", false, false);
+
+                   slot.extra_tokens.push_back(llama_token_fim_rep(model));
+                   slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+               }
+
+               for (const auto & chunk : slot.params.extra_context) {
+                   // { "text": string, "filename": string }
+                   const std::string text = chunk.value("text", "");
+                   const std::string filename = chunk.value("filename", "tmp");
+
+                   if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                       const auto k_fim_file = tokenize(filename + "\n", false, false);
+
+                       slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                       slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+                   } else {
+                       // chunk separator in binary form to avoid confusing the AI
+                       static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+                       static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
+
+                       slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+                   }
+
+                   const auto chunk_tokens = tokenize(text, false, false);
+                   slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+               }
+
+               if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                   // TODO: current filename
+                   static const auto k_fim_file = tokenize("filename\n", false, false);
+
+                   slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                   slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+               }
+
+               // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+               const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
                const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
+
+               // fill the rest of the context with extra chunks
+               const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
+
                prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
                suffix_tokens.resize(n_suffix_take);

@@ -1958,6 +2031,11 @@ struct server_context {
                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                }
+
+               SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
+
+               // put the extra context before the FIM prefix
+               embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
+
                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
                embd_inp.push_back(llama_token_fim_mid(model));

@@ -2016,7 +2094,7 @@ struct server_context {
                }
                slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

-               // if input prompt is too big, truncate it (if group attention self-extend is disabled)
+               // if input prompt is too big, truncate it
                if (slot.n_prompt_tokens >= slot.n_ctx) {
                    const int n_left = slot.n_ctx - slot.params.n_keep;

@@ -2046,12 +2124,82 @@ struct server_context {

            if (slot.params.cache_prompt) {
                // reuse any previously computed tokens that are common with the new prompt
-               slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+               slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);

                // push the prompt into the sampling context (do not apply grammar)
                for (int i = 0; i < slot.n_past; ++i) {
                    common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
                }
+
+               // reuse chunks from the cached prompt by shifting their KV cache in the new position
+               if (params.n_cache_reuse > 0) {
+                   size_t head_c = slot.n_past; // cache
+                   size_t head_p = slot.n_past; // current prompt
+
+                   SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+
+                   while (head_c < slot.cache_tokens.size() &&
+                          head_p < prompt_tokens.size()) {
+                       if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
+                           slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
+                           slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
+                           break;
+                       }
+
+                       if (llama_token_is_control(model, prompt_tokens[head_p]) &&
+                           prompt_tokens[head_p] != llama_token_fim_rep(model) &&
+                           prompt_tokens[head_p] != llama_token_fim_sep(model)) {
+                           break;
+                       }
+
+                       size_t n_match = 0;
+
+                       while (head_c + n_match < slot.cache_tokens.size() &&
+                              head_p + n_match < prompt_tokens.size() &&
+                              slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
+                           if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
+                               slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
+                               slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
+                               break;
+                           }
+
+                           if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
+                               prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
+                               prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
+                               break;
+                           }
+
+                           n_match++;
+                       }
+
+                       if (n_match >= (size_t) params.n_cache_reuse) {
+                           SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                           //for (size_t i = head_p; i < head_p + n_match; i++) {
+                           //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                           //}
+
+                           const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
+
+                           llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
+                           llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
+
+                           for (size_t i = 0; i < n_match; i++) {
+                               slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+
+                               common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
+
+                               slot.n_past++;
+                           }
+
+                           head_c += n_match;
+                           head_p += n_match;
+                       } else {
+                           head_c += 1;
+                       }
+                   }
+
+                   SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+               }
            }
        }

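To make the reuse logic in the hunk above easier to follow, here is a hedged, standalone sketch of the same chunk-matching scan reduced to plain token vectors. It deliberately omits the control-token checks and the actual KV-cache calls (`llama_kv_cache_seq_rm` / `llama_kv_cache_seq_add` in the diff); the function and struct names are ours, not the server's.

```cpp
// Minimal sketch of the cache-reuse scan: starting after the longest common
// prefix, look for runs of identical tokens of at least n_cache_reuse length.
// The server keeps such runs by shifting their KV-cache entries instead of
// re-evaluating them; here we only report where the runs are.
#include <cstdio>
#include <vector>

using llama_token = int;

struct reused_chunk {
    size_t src; // position in the cached prompt
    size_t dst; // position in the new prompt
    size_t len; // number of tokens reused
};

static std::vector<reused_chunk> find_reusable_chunks(
        const std::vector<llama_token> & cache,
        const std::vector<llama_token> & prompt,
        size_t n_past, int n_cache_reuse) {
    std::vector<reused_chunk> res;

    size_t head_c = n_past; // cursor in the cached tokens
    size_t head_p = n_past; // cursor in the new prompt

    while (head_c < cache.size() && head_p < prompt.size()) {
        size_t n_match = 0;
        while (head_c + n_match < cache.size() &&
               head_p + n_match < prompt.size() &&
               cache[head_c + n_match] == prompt[head_p + n_match]) {
            n_match++;
        }

        if (n_match >= (size_t) n_cache_reuse) {
            // the server shifts the KV cache of this run from [head_c, head_c + n_match)
            // to [head_p, head_p + n_match) and advances slot.n_past accordingly
            res.push_back({head_c, head_p, n_match});
            head_c += n_match;
            head_p += n_match;
        } else {
            head_c += 1;
        }
    }

    return res;
}

int main() {
    // tokens 3 and 4 were removed from the new prompt; the tail can be reused
    const std::vector<llama_token> cache  = {1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<llama_token> prompt = {1, 2, 5, 6, 7, 8};

    // n_past = 2 is the longest common prefix of the two sequences
    for (const auto & c : find_reusable_chunks(cache, prompt, /*n_past=*/2, /*n_cache_reuse=*/2)) {
        printf("reuse %zu tokens: cache[%zu..) -> prompt[%zu..)\n", c.len, c.src, c.dst);
    }
    return 0;
}
```

In the server itself the same scan additionally refuses to match across control tokens (other than `FIM_REP`/`FIM_SEP`) and performs the shift with `llama_kv_cache_seq_rm` and `llama_kv_cache_seq_add`, as shown in the diff.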
@@ -3261,6 +3409,7 @@ int main(int argc, char ** argv) {

    ctx_server.queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, &ctx_server, std::placeholders::_1));

    ctx_server.queue_tasks.on_update_slots(std::bind(
        &server_context::update_slots, &ctx_server));

@@ -195,14 +195,14 @@ static std::string gen_chatcmplid() {
// other common utils
//

-static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}

    return i;
}

-static size_t common_part(const std::string & a, const std::string & b) {
+static size_t longest_common_prefix(const std::string & a, const std::string & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}

@@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;
+   // load 2 halfs into register in a single instruction
+   const half2 x_reg = *((half2 *) &(x[ib + iqs]));
    // automatic half -> float type cast if dfloat == float
-   v.x = x[ib + iqs + 0];
-   v.y = x[ib + iqs + 1];
+   v.x = __low2float(x_reg);
+   v.y = __high2float(x_reg);
}

static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {

@@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
        // matrix multiplication
        // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
-       tmp += __hmul2(v, {
-           y[iybs + iqs + j/qr + 0],
-           y[iybs + iqs + j/qr + y_offset]
-       });
+       if ( y_offset == 1 ) {
+           // load 2 dfloats into register in a single instruction
+           const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
+           tmp += __hmul2(v, y_reg);
+       }
+       else {
+           tmp += __hmul2(v, {
+               y[iybs + iqs + j/qr + 0],
+               y[iybs + iqs + j/qr + y_offset]
+           });
+       }
#else
-       tmp += v.x * y[iybs + iqs + j/qr + 0];
-       tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+       if ( y_offset == 1 ) {
+           // load 2 dfloats into register in a single instruction
+           const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
+           tmp += v.x * y_reg.x;
+           tmp += v.y * y_reg.y;
+       }
+       else {
+           tmp += v.x * y[iybs + iqs + j/qr + 0];
+           tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+       }
#endif // GGML_CUDA_F16
        }
    }

@@ -6596,8 +6596,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_eot_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6610,8 +6610,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_eom_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6627,8 +6627,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_pre_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6644,8 +6644,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_suf_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6661,8 +6661,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_mid_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6677,8 +6677,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_pad_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6694,8 +6694,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_rep_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6708,8 +6708,8 @@ static void llm_load_vocab(
            ) {
                vocab.special_fim_sep_id = t.second;
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }

@@ -6720,6 +6720,19 @@ static void llm_load_vocab(
        // this is currently determined based on the token text, which is obviously not ideal
        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
        vocab.special_eog_ids.clear();
+
+       if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+           vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+       }
+
+       if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+           vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+       }
+
+       if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+           vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+       }
+
        for (const auto & t : vocab.token_to_id) {
            if (false
                || t.first == "<|eot_id|>"

@@ -6732,13 +6745,20 @@ static void llm_load_vocab(
            ) {
                vocab.special_eog_ids.insert(t.second);
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                   LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                           __func__, t.first.c_str());
+                   LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                           __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
+           } else {
+               // token is control, but not marked as EOG -> print a warning
+               if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                   LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                           __func__, t.second, t.first.c_str());
+               }
            }
        }

+       // sanity checks
        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eos_id);
            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);