server : add --no-context-shift option (#9607)
* server : add --no-context-shift option * small fix * Update examples/server/tests/features/embeddings.feature Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * tests : minor fix * revert usage of GGML_ASSERT * update server documentation --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f0c7b5edf8
commit
0b3bf966f4
6 changed files with 139 additions and 22 deletions
|
@ -1180,6 +1180,15 @@ struct server_context {
|
|||
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
|
||||
}
|
||||
|
||||
// if context shift is disabled, we stop when it reaches the context limit
|
||||
if (slot.n_decoded >= slot.n_ctx) {
|
||||
slot.truncated = true;
|
||||
slot.stopped_limit = true;
|
||||
slot.has_next_token = false;
|
||||
|
||||
SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
|
||||
}
|
||||
|
||||
if (llama_token_is_eog(model, result.tok)) {
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
|
@ -1480,7 +1489,7 @@ struct server_context {
|
|||
if (result.error) {
|
||||
error_handler(result.data);
|
||||
cancel_tasks(id_tasks);
|
||||
break;
|
||||
return;
|
||||
}
|
||||
|
||||
size_t idx = result.data["index"];
|
||||
|
@ -1827,6 +1836,14 @@ struct server_context {
|
|||
for (server_slot & slot : slots) {
|
||||
if (slot.ga_n == 1) {
|
||||
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
|
||||
if (!params.ctx_shift) {
|
||||
// this check is intentionally redundant (kept as a safety net)
|
||||
// we should never get here, because generation should have already stopped in process_token()
|
||||
slot.release();
|
||||
send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Shift context
|
||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
||||
|
@ -1961,6 +1978,14 @@ struct server_context {
|
|||
continue;
|
||||
}
|
||||
} else {
|
||||
if (!params.ctx_shift) {
|
||||
// if context shift is disabled, we make sure prompt size is smaller than KV size
|
||||
if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
|
||||
slot.release();
|
||||
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (slot.params.n_keep < 0) {
|
||||
slot.params.n_keep = slot.n_prompt_tokens;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue