llama.vim : async context processing
ggml-ci
parent 2960510153
commit bc2857b88c

1 changed file with 68 additions and 26 deletions
@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512
+" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
 "
 " --batch-size [512, model max context]
 "
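Note: expanded onto two lines for readability, the updated example invocation reads as follows ({model.gguf} remains a placeholder for any FIM-capable GGUF model):

    $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa \
          --ubatch-size 512 --batch-size 1024 --cache-reuse 512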
@@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f
 "
 " - completion request
 " - yank
-" - reading a file
+" - entering a buffer
+" - leaving a buffer
+" - writing a file
 "
 " ring context parameters:
 "
@@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
     endif
 
     call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()})
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, l:chunk.str)
+    endfor
+
+    let l:request = json_encode({
+        \ 'prompt':           "",
+        \ 'input_prefix':     "",
+        \ 'input_suffix':     "",
+        \ 'n_predict':        1,
+        \ 'penalty_last_n':   0,
+        \ 'top_k':            100,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["top_k", "infill"],
+        \ 'cache_prompt':     v:true,
+        \ 'extra_context':    l:extra_context,
+        \ 't_max_prompt_ms':  1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    call jobstart(l:curl_command, {
+        \ 'on_exit': function('s:fim_on_exit')
+        \ })
 endfunction
 
 function! llama#fim(is_auto) abort
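Note: the jobstart() call above registers s:fim_on_exit, which this diff does not touch. A minimal sketch of what such an exit handler could look like, assuming it only needs to report failed background requests (Neovim invokes it with the job id, the exit code, and the event name):

    " hypothetical minimal exit handler for the background context request
    function! s:fim_on_exit(job_id, exit_code, event) dict
        if a:exit_code != 0
            echom "llama.vim: background job failed, exit code " . a:exit_code
        endif
    endfunction

The request itself asks for a single token ('n_predict': 1) under millisecond time budgets, so its output is irrelevant; as the comment in the hunk states, its purpose is to have the server ingest and cache the new extra_context before the next real FIM request arrives.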
@@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort
         \ . join(l:lines_suffix, "\n")
         \ . "\n"
 
-    " TODO: per-file location
-    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
-
-    " only gather chunks if the cursor has moved a lot
-    if a:is_auto && l:delta_y > 32
-        " randomly pick a prefix or a suffix chunk
-        if s:rand(0, 1)
-            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
-        else
-            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
-        endif
-
-        let s:pos_y_pick = s:pos_y
-    endif
-
     " array of strings
     let l:extra_context = []
     for l:chunk in s:ring_chunks
@@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort
         \ 'is_auto': a:is_auto
         \ })
 
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " only gather chunks if the cursor has moved a lot
+    if a:is_auto && l:delta_y > 32
+        " randomly pick a prefix or a suffix chunk
+        if s:rand(0, 1)
+            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+        else
+            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
+        endif
+
+        let s:pos_y_pick = s:pos_y
+    endif
+
     " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line
     if !a:is_auto
         augroup llama_insert
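Note: the relocated block calls the s:rand helper, which lives elsewhere in the script and is unchanged here. A plausible sketch of such a helper, assuming Vim's built-in rand() is available (Vim 8.1.2342+ / Neovim):

    " hypothetical sketch: uniform random integer in the inclusive range [i0, i1]
    function! s:rand(i0, i1) abort
        return a:i0 + rand() % (a:i1 - a:i0 + 1)
    endfunction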
@@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:generation_settings = get(l:response, 'generation_settings', {})
     let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
 
     let l:n_cached = get(l:response, 'tokens_cached', 0)
+    let l:truncated = get(l:response, 'truncated', v:false)
 
     " if response.timings is available
     if len(get(l:response, 'timings', {})) > 0
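Note: with the added truncated flag, the fields that s:fim_on_stdout reads from the server response look roughly like this (values illustrative, unrelated fields omitted):

    " illustrative response fragment consumed above:
    " {
    "   "tokens_cached":       3072,
    "   "truncated":           false,
    "   "generation_settings": { "n_ctx": 4096 },
    "   "timings":             { ... }
    " }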
@@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')
 
-    " construct the info message and display it to the right of the current line
+    " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
 
-        let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
-            \ g:llama_config.show_info == 2 ? l:prefix : '',
-            \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
-            \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
-            \ l:n_predict, l:t_predict_ms, l:s_predict,
-            \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
-            \ )
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
+                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
 
         if g:llama_config.show_info == 1
+            " display it in the statusline
             let &statusline = l:info
         elseif g:llama_config.show_info == 2
+            " display it to the right of the current line
             call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
                 \ 'virt_text': [[l:info, 'llama_hl_info']],
                 \ 'virt_text_pos': 'eol',
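Note: when the server reports truncation, the new branch renders an info line of the form (numbers illustrative):

    llama.vim | WARNING: the context is full: 4096 / 4096, increase the server context size or reduce g:llama_config.ring_n_chunks

With g:llama_config.show_info == 2 the 'llama.vim' prefix is replaced by the whitespace prefix, so the warning is offset to the right of the FIM overlay instead of appearing in the statusline.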