llama.vim : process extra chunks in the background [no ci]
This commit is contained in:
parent
0c1f51b73e
commit
42a9008b31
1 changed files with 73 additions and 13 deletions
|
@ -17,11 +17,11 @@
|
||||||
"
|
"
|
||||||
" start the llama.cpp server with a FIM-compatible model. for example:
|
" start the llama.cpp server with a FIM-compatible model. for example:
|
||||||
"
|
"
|
||||||
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
|
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512
|
||||||
"
|
"
|
||||||
" --batch-size [512, model max context]
|
" --batch-size [512, model max context]
|
||||||
"
|
"
|
||||||
" adjust the batch size to control how much of the provided context will be used during the inference
|
" adjust the batch size to control how much of the provided local context will be used during the inference
|
||||||
" lower values will use smaller part of the context around the cursor, which will result in faster processing
|
" lower values will use smaller part of the context around the cursor, which will result in faster processing
|
||||||
"
|
"
|
||||||
" --ubatch-size [64, 2048]
|
" --ubatch-size [64, 2048]
|
||||||
|
@ -58,11 +58,12 @@ highlight llama_hl_info guifg=#77ff2f
|
||||||
" - leaving a buffer
|
" - leaving a buffer
|
||||||
" - writing a file
|
" - writing a file
|
||||||
"
|
"
|
||||||
" ring context parameters:
|
" parameters for the ring-buffer with extra context:
|
||||||
"
|
"
|
||||||
" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
|
" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
|
||||||
" ring_chunk_size: max size of the chunks (in number of lines)
|
" ring_chunk_size: max size of the chunks (in number of lines)
|
||||||
" ring_scope: the range around the cursor position (in number of lines) for gathering chunks
|
" ring_scope: the range around the cursor position (in number of lines) for gathering chunks
|
||||||
|
" ring_update_ms: how often to process queued chunks in normal mode
|
||||||
"
|
"
|
||||||
let s:default_config = {
|
let s:default_config = {
|
||||||
\ 'endpoint': 'http://127.0.0.1:8012/infill',
|
\ 'endpoint': 'http://127.0.0.1:8012/infill',
|
||||||
|
@ -77,6 +78,7 @@ let s:default_config = {
|
||||||
\ 'ring_n_chunks': 64,
|
\ 'ring_n_chunks': 64,
|
||||||
\ 'ring_chunk_size': 64,
|
\ 'ring_chunk_size': 64,
|
||||||
\ 'ring_scope': 1024,
|
\ 'ring_scope': 1024,
|
||||||
|
\ 'ring_update_ms': 1000,
|
||||||
\ }
|
\ }
|
||||||
|
|
||||||
let g:llama_config = get(g:, 'llama_config', s:default_config)
|
let g:llama_config = get(g:, 'llama_config', s:default_config)
|
||||||
|
@ -101,7 +103,8 @@ function! llama#init()
|
||||||
let s:line_cur_prefix = ''
|
let s:line_cur_prefix = ''
|
||||||
let s:line_cur_suffix = ''
|
let s:line_cur_suffix = ''
|
||||||
|
|
||||||
let s:ring_chunks = []
|
let s:ring_chunks = [] " current set of chunks used as extra context
|
||||||
|
let s:ring_queued = [] " chunks that are queued to be sent for processing
|
||||||
let s:ring_n_evict = 0
|
let s:ring_n_evict = 0
|
||||||
|
|
||||||
let s:hint_shown = v:false
|
let s:hint_shown = v:false
|
||||||
|
@ -112,6 +115,7 @@ function! llama#init()
|
||||||
|
|
||||||
let s:timer_fim = -1
|
let s:timer_fim = -1
|
||||||
let s:t_fim_start = reltime() " used to measure total FIM time
|
let s:t_fim_start = reltime() " used to measure total FIM time
|
||||||
|
let s:t_last_move = reltime() " last time the cursor moved
|
||||||
|
|
||||||
let s:current_job = v:null
|
let s:current_job = v:null
|
||||||
|
|
||||||
|
@ -120,15 +124,14 @@ function! llama#init()
|
||||||
autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false, v:false)
|
autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false, v:false)
|
||||||
autocmd InsertLeavePre * call llama#fim_cancel()
|
autocmd InsertLeavePre * call llama#fim_cancel()
|
||||||
|
|
||||||
autocmd CursorMoved * call llama#fim_cancel()
|
autocmd CursorMoved * call s:on_move()
|
||||||
|
autocmd CursorMovedI * call s:on_move()
|
||||||
autocmd CompleteChanged * call llama#fim_cancel()
|
autocmd CompleteChanged * call llama#fim_cancel()
|
||||||
|
|
||||||
if g:llama_config.auto_fim
|
if g:llama_config.auto_fim
|
||||||
autocmd InsertEnter * call llama#fim(v:true, v:false)
|
autocmd InsertEnter * call llama#fim(v:true, v:false)
|
||||||
autocmd CursorMovedI * call llama#fim(v:true, v:false)
|
autocmd CursorMovedI * call llama#fim(v:true, v:false)
|
||||||
"autocmd CursorHoldI * call llama#fim(v:true, v:true)
|
"autocmd CursorHoldI * call llama#fim(v:true, v:true)
|
||||||
else
|
|
||||||
autocmd CursorMovedI * call llama#fim_cancel()
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
|
autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
|
||||||
|
@ -142,6 +145,11 @@ function! llama#init()
|
||||||
augroup END
|
augroup END
|
||||||
|
|
||||||
silent! call llama#fim_cancel()
|
silent! call llama#fim_cancel()
|
||||||
|
|
||||||
|
" init background update of the ring buffer
|
||||||
|
if g:llama_config.ring_n_chunks > 0
|
||||||
|
call s:ring_update()
|
||||||
|
endif
|
||||||
endfunction
|
endfunction
|
||||||
|
|
||||||
" TODO: figure out something better
|
" TODO: figure out something better
|
||||||
|
@ -163,6 +171,7 @@ function! s:chunk_sim(c0, c1)
|
||||||
return 2.0 * l:common / (l:lines0 + l:lines1)
|
return 2.0 * l:common / (l:lines0 + l:lines1)
|
||||||
endfunction
|
endfunction
|
||||||
|
|
||||||
|
" pick a chunk from the provided text and queue it for processing
|
||||||
function! s:pick_chunk(text, no_mod, do_evict)
|
function! s:pick_chunk(text, no_mod, do_evict)
|
||||||
" do not pick chunks from buffers with pending changes or buffers that are not files
|
" do not pick chunks from buffers with pending changes or buffers that are not files
|
||||||
if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
|
if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
|
||||||
|
@ -190,6 +199,7 @@ function! s:pick_chunk(text, no_mod, do_evict)
|
||||||
|
|
||||||
" check if this chunk is already added
|
" check if this chunk is already added
|
||||||
let l:exist = v:false
|
let l:exist = v:false
|
||||||
|
|
||||||
for i in range(len(s:ring_chunks))
|
for i in range(len(s:ring_chunks))
|
||||||
if s:ring_chunks[i].data == l:chunk
|
if s:ring_chunks[i].data == l:chunk
|
||||||
let l:exist = v:true
|
let l:exist = v:true
|
||||||
|
@ -197,11 +207,30 @@ function! s:pick_chunk(text, no_mod, do_evict)
|
||||||
endif
|
endif
|
||||||
endfor
|
endfor
|
||||||
|
|
||||||
|
for i in range(len(s:ring_queued))
|
||||||
|
if s:ring_queued[i].data == l:chunk
|
||||||
|
let l:exist = v:true
|
||||||
|
break
|
||||||
|
endif
|
||||||
|
endfor
|
||||||
|
|
||||||
if l:exist
|
if l:exist
|
||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
" evict chunks that are very similar to the new one
|
" evict chunks that are very similar to the new one
|
||||||
|
for i in range(len(s:ring_queued) - 1, 0, -1)
|
||||||
|
if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.5
|
||||||
|
if a:do_evict
|
||||||
|
call remove(s:ring_queued, i)
|
||||||
|
let s:ring_n_evict += 1
|
||||||
|
else
|
||||||
|
return
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endfor
|
||||||
|
|
||||||
|
" also from s:ring_chunks
|
||||||
for i in range(len(s:ring_chunks) - 1, 0, -1)
|
for i in range(len(s:ring_chunks) - 1, 0, -1)
|
||||||
if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5
|
if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5
|
||||||
if a:do_evict
|
if a:do_evict
|
||||||
|
@ -213,11 +242,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
|
||||||
endif
|
endif
|
||||||
endfor
|
endfor
|
||||||
|
|
||||||
|
if len(s:ring_queued) == 16
|
||||||
|
call remove(s:ring_queued, 0)
|
||||||
|
endif
|
||||||
|
|
||||||
|
call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
|
||||||
|
|
||||||
|
"let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
|
||||||
|
endfunction
|
||||||
|
|
||||||
|
" called every g:llama_config.ring_update_ms, processed chunks are moved to s:ring_chunks
|
||||||
|
function! s:ring_update()
|
||||||
|
call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
|
||||||
|
|
||||||
|
" update only if in normal mode or if the cursor hasn't moved for a while
|
||||||
|
if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
|
||||||
|
return
|
||||||
|
endif
|
||||||
|
|
||||||
|
if len(s:ring_queued) == 0
|
||||||
|
return
|
||||||
|
endif
|
||||||
|
|
||||||
|
" move the first queued chunk to the ring buffer
|
||||||
if len(s:ring_chunks) == g:llama_config.ring_n_chunks
|
if len(s:ring_chunks) == g:llama_config.ring_n_chunks
|
||||||
call remove(s:ring_chunks, 0)
|
call remove(s:ring_chunks, 0)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
|
call add(s:ring_chunks, remove(s:ring_queued, 0))
|
||||||
|
|
||||||
|
"let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
|
||||||
|
|
||||||
" send asynchronous job with the new extra context so that it is ready for the next FIM
|
" send asynchronous job with the new extra context so that it is ready for the next FIM
|
||||||
let l:extra_context = []
|
let l:extra_context = []
|
||||||
|
@ -229,16 +283,16 @@ function! s:pick_chunk(text, no_mod, do_evict)
|
||||||
\ })
|
\ })
|
||||||
endfor
|
endfor
|
||||||
|
|
||||||
|
" no samplers needed here
|
||||||
let l:request = json_encode({
|
let l:request = json_encode({
|
||||||
\ 'prompt': "",
|
\ 'prompt': "",
|
||||||
\ 'input_prefix': "",
|
\ 'input_prefix': "",
|
||||||
\ 'input_suffix': "",
|
\ 'input_suffix': "",
|
||||||
\ 'n_predict': 1,
|
\ 'n_predict': 1,
|
||||||
\ 'penalty_last_n': 0,
|
\ 'penalty_last_n': 0,
|
||||||
\ 'top_k': 40,
|
\ 'temperature': 0.0,
|
||||||
\ 'top_p': 0.99,
|
|
||||||
\ 'stream': v:false,
|
\ 'stream': v:false,
|
||||||
\ 'samplers': ["top_k", "top_p", "infill"],
|
\ 'samplers': ["temperature"],
|
||||||
\ 'cache_prompt': v:true,
|
\ 'cache_prompt': v:true,
|
||||||
\ 'extra_context': l:extra_context,
|
\ 'extra_context': l:extra_context,
|
||||||
\ 't_max_prompt_ms': 1,
|
\ 't_max_prompt_ms': 1,
|
||||||
|
@ -409,6 +463,12 @@ function! llama#fim_cancel()
|
||||||
silent! iunmap <buffer> <Esc>
|
silent! iunmap <buffer> <Esc>
|
||||||
endfunction
|
endfunction
|
||||||
|
|
||||||
|
function! s:on_move()
|
||||||
|
let s:t_last_move = reltime()
|
||||||
|
|
||||||
|
call llama#fim_cancel()
|
||||||
|
endfunction
|
||||||
|
|
||||||
" callback that processes the result from the server
|
" callback that processes the result from the server
|
||||||
function! s:fim_on_stdout(job_id, data, event) dict
|
function! s:fim_on_stdout(job_id, data, event) dict
|
||||||
let l:raw = join(a:data, "\n")
|
let l:raw = join(a:data, "\n")
|
||||||
|
@ -511,9 +571,9 @@ function! s:fim_on_stdout(job_id, data, event) dict
|
||||||
\ l:n_cached, l:n_ctx
|
\ l:n_cached, l:n_ctx
|
||||||
\ )
|
\ )
|
||||||
else
|
else
|
||||||
let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
|
let l:info = printf("%s | context: %d / %d / r=%d / q=%d / e=%d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
|
||||||
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
|
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
|
||||||
\ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
|
\ l:n_cached, l:n_ctx, len(s:ring_chunks), len(s:ring_queued), s:ring_n_evict,
|
||||||
\ l:n_prompt, l:t_prompt_ms, l:s_prompt,
|
\ l:n_prompt, l:t_prompt_ms, l:s_prompt,
|
||||||
\ l:n_predict, l:t_predict_ms, l:s_predict,
|
\ l:n_predict, l:t_predict_ms, l:s_predict,
|
||||||
\ 1000.0 * reltimefloat(reltime(s:t_fim_start))
|
\ 1000.0 * reltimefloat(reltime(s:t_fim_start))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue