llama.vim : set time limit for the generation phase

commit 6669b550db (parent c507a65af5)

2 changed files with 36 additions and 29 deletions
llama.vim
@@ -12,14 +12,14 @@ highlight llama_hl_hint guifg=#ff772f
 highlight llama_hl_info guifg=#77ff2f
 
 let s:default_config = {
     \ 'endpoint':         'http://127.0.0.1:8012/infill',
     \ 'n_prefix':         128,
     \ 'n_suffix':         128,
     \ 'n_predict':        64,
-    \ 'n_probs':          3,
-    \ 'temperature':      0.1,
+    \ 't_max_prompt_ms':  300,
+    \ 't_max_predict_ms': 200,
     \ 'auto_fim':         v:true,
     \ 'stop':             ["\n"]
     \ }
 
 let g:llama_config = get(g:, 'llama_config', s:default_config)
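The two new keys default to 300 ms for prompt processing and 200 ms for prediction. As a usage note (not part of the commit): because the plugin takes g:llama_config wholesale via get() rather than merging it with s:default_config, a vimrc override has to supply the complete dictionary. A minimal sketch, with the two limits raised to illustrative values:

" illustrative ~/.vimrc override; the plugin does not merge missing keys,
" so the full dictionary from s:default_config is repeated here
let g:llama_config = {
    \ 'endpoint':         'http://127.0.0.1:8012/infill',
    \ 'n_prefix':         128,
    \ 'n_suffix':         128,
    \ 'n_predict':        64,
    \ 't_max_prompt_ms':  500,
    \ 't_max_predict_ms': 500,
    \ 'auto_fim':         v:true,
    \ 'stop':             ["\n"]
    \ }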
|
@@ -48,6 +48,8 @@ function! llama#init()
         autocmd!
         autocmd InsertEnter * inoremap <buffer> <silent> <C-F> <C-O>:call llama#fim(v:false)<CR>
         autocmd InsertLeave * call llama#fim_cancel()
+
+        autocmd CursorMoved * call llama#fim_cancel()
     augroup END
 
     silent! call llama#fim_cancel()
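The added CursorMoved autocmd cancels the current suggestion whenever the cursor moves away from the position it was requested for. A self-contained sketch of the same cancel-on-move pattern, using hypothetical names (DemoCancel, g:demo_pending) rather than the plugin's internals:

" standalone sketch of the cancel-on-move pattern (hypothetical names)
let g:demo_pending = v:false

function! DemoCancel() abort
    " drop whatever suggestion is currently in flight
    let g:demo_pending = v:false
endfunction

augroup demo_cancel_on_move
    autocmd!
    autocmd CursorMoved * call DemoCancel()
augroup END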
|
@@ -85,19 +87,20 @@ function! llama#fim(is_auto) abort
             \ . "\n"
 
     let l:request = json_encode({
         \ 'prompt':           "",
         \ 'input_prefix':     l:prefix,
         \ 'input_suffix':     l:suffix,
        "\ 'stop':             g:llama_config.stop,
         \ 'n_predict':        g:llama_config.n_predict,
-       "\ 'n_probs':          g:llama_config.n_probs,
         \ 'penalty_last_n':   0,
-        \ 'temperature':      g:llama_config.temperature,
         \ 'top_k':            5,
         \ 'infill_p':         0.20,
         \ 'infill_p_eog':     0.001,
         \ 'stream':           v:false,
-        \ 'samplers':         ["top_k", "infill"]
+        \ 'samplers':         ["top_k", "infill"],
+        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
+        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms,
+        \ 'cache_prompt':     v:true
         \ })
 
     let l:curl_command = printf(
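The request body now forwards the configured limits as t_max_prompt_ms and t_max_predict_ms and enables cache_prompt. A minimal standalone sketch of such a request against the /infill endpoint; this is not the plugin's exact curl invocation, and the prefix/suffix strings and limit values are made up for illustration:

" hand-rolled /infill request carrying the new time-limit fields (illustrative values)
let req = json_encode({
    \ 'prompt':           "",
    \ 'input_prefix':     "def add(a, b):\n    ",
    \ 'input_suffix':     "\n",
    \ 'n_predict':        64,
    \ 'stream':           v:false,
    \ 'cache_prompt':     v:true,
    \ 't_max_prompt_ms':  300,
    \ 't_max_predict_ms': 200
    \ })

echo system('curl -s -H "Content-Type: application/json" -d ' . shellescape(req) . ' http://127.0.0.1:8012/infill')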
|
@@ -181,9 +184,9 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:t_prompt_ms = 1.0
     let l:s_prompt    = 0
 
-    let l:n_gen    = 0
-    let l:t_gen_ms = 1.0
-    let l:s_gen    = 0
+    let l:n_predict    = 0
+    let l:t_predict_ms = 1.0
+    let l:s_predict    = 0
 
     if s:can_accept && v:shell_error
         if !self.is_auto
@@ -221,9 +224,9 @@ function! s:fim_on_stdout(job_id, data, event) dict
             let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
             let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)
 
-            let l:n_gen    = get(l:timings, 'predicted_n', 0)
-            let l:t_gen_ms = get(l:timings, 'predicted_ms', 1)
-            let l:s_gen    = get(l:timings, 'predicted_per_second', 0)
+            let l:n_predict    = get(l:timings, 'predicted_n', 0)
+            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
+            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
         endif
     endif
 
@@ -256,8 +259,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
 
     let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms",
         \ l:prefix,
         \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
-        \ l:n_gen, l:t_gen_ms, l:s_gen,
+        \ l:n_predict, l:t_predict_ms, l:s_predict,
         \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
         \ )
 
llama.cpp
@@ -6725,6 +6725,10 @@ static void llm_load_vocab(
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+        }
     }
 
     // build special tokens cache