server : token healing for infilling/FIM
This commit is contained in:
parent
d5eea13797
commit
3ba5c55bc4
1 changed files with 23 additions and 15 deletions
|
@ -2088,6 +2088,8 @@ struct server_context {
|
||||||
slot.t_start_process_prompt = ggml_time_us();
|
slot.t_start_process_prompt = ggml_time_us();
|
||||||
slot.t_start_generation = 0;
|
slot.t_start_generation = 0;
|
||||||
|
|
||||||
|
llama_token_healing_output token_healing_out{};
|
||||||
|
|
||||||
if (slot.infill) {
|
if (slot.infill) {
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
bool suff_rm_leading_spc = true;
|
bool suff_rm_leading_spc = true;
|
||||||
|
@ -2107,6 +2109,12 @@ struct server_context {
|
||||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||||
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
|
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
|
||||||
|
|
||||||
|
if (slot.sparams.token_healing_enabled) {
|
||||||
|
// For FIM roll back only the prefix part (i.e. cursor location)
|
||||||
|
token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
|
||||||
|
prefix_tokens, slot.sparams.token_healing_n_rollback);
|
||||||
|
}
|
||||||
|
|
||||||
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
||||||
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
||||||
if (add_bos) {
|
if (add_bos) {
|
||||||
|
@ -2122,6 +2130,11 @@ struct server_context {
|
||||||
prompt_tokens = embd_inp;
|
prompt_tokens = embd_inp;
|
||||||
} else {
|
} else {
|
||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||||
|
|
||||||
|
if (slot.sparams.token_healing_enabled) {
|
||||||
|
token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
|
||||||
|
prompt_tokens, slot.sparams.token_healing_n_rollback);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
|
@ -2136,6 +2149,16 @@ struct server_context {
|
||||||
{"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
|
{"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (slot.sparams.token_healing_enabled) {
|
||||||
|
slot.n_th_prefix = token_healing_out.prefix.size();
|
||||||
|
LOG_VERBOSE("token healing prompt", {
|
||||||
|
{"id_slot", slot.id},
|
||||||
|
{"id_task", slot.id_task},
|
||||||
|
{"removed_suffix", token_healing_out.prefix},
|
||||||
|
{"n_tokens_removed", token_healing_out.n_tokens_removed}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// empty prompt passed -> release the slot and send empty response
|
// empty prompt passed -> release the slot and send empty response
|
||||||
if (prompt_tokens.empty()) {
|
if (prompt_tokens.empty()) {
|
||||||
LOG_INFO("empty prompt - releasing slot", {
|
LOG_INFO("empty prompt - releasing slot", {
|
||||||
|
@ -2151,21 +2174,6 @@ struct server_context {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Roll back prompt tokens if token healing
|
|
||||||
llama_token_healing_output token_healing_out{};
|
|
||||||
if (slot.sparams.token_healing_enabled) {
|
|
||||||
token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
|
|
||||||
prompt_tokens, slot.sparams.token_healing_n_rollback);
|
|
||||||
slot.n_th_prefix = token_healing_out.prefix.size();
|
|
||||||
slot.n_prompt_tokens = prompt_tokens.size();
|
|
||||||
LOG_VERBOSE("token healing prompt", {
|
|
||||||
{"id_slot", slot.id},
|
|
||||||
{"id_task", slot.id_task},
|
|
||||||
{"removed_suffix", token_healing_out.prefix},
|
|
||||||
{"n_tokens_removed", token_healing_out.n_tokens_removed}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (slot.embedding) {
|
if (slot.embedding) {
|
||||||
// this prompt is too large to process - discard it
|
// this prompt is too large to process - discard it
|
||||||
if (slot.n_prompt_tokens > n_ubatch) {
|
if (slot.n_prompt_tokens > n_ubatch) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue