llama : clean-up
ggml-ci
parent 4b1bd81661
commit 865d9bc48a

2 changed files with 85 additions and 46 deletions
@@ -1,31 +1,72 @@
+" LLM-based text completion using llama.cpp
+"
+" requires:
+"
+"   - neovim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
 " sample config:
 "
-"   - Ctrl+F - trigger FIM completion manually
+"   - Tab - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the suggestion
+"   - Ctrl+F - trigger FIM completion manually
 "
-" run this once to initialise the plugin:
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
 "
-"   :call llama#init()
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided context will be used during the inference
+"     lower values will use a smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
 "
 
 " color of the suggested text
 highlight llama_hl_hint guifg=#ff772f
 highlight llama_hl_info guifg=#77ff2f
 
+" endpoint:        llama.cpp server endpoint
+" n_prefix:        number of lines to include in the prefix
+" n_suffix:        number of lines to include in the suffix
+" n_predict:       max number of tokens to predict
+" t_max_prompt_ms: max allotted time for the prompt processing
+" show_info:       show extra info about the inference
+" auto_fim:        trigger FIM completion automatically on cursor movement
 let s:default_config = {
     \ 'endpoint': 'http://127.0.0.1:8012/infill',
-    \ 'n_prefix': 128,
-    \ 'n_suffix': 128,
+    \ 'n_prefix': 256,
+    \ 'n_suffix': 256,
     \ 'n_predict': 64,
-    \ 't_max_prompt_ms': 300,
+    \ 't_max_prompt_ms': 500,
     \ 't_max_predict_ms': 200,
+    \ 'show_info': v:true,
     \ 'auto_fim': v:true,
-    \ 'stop': ["\n"]
     \ }
 
 let g:llama_config = get(g:, 'llama_config', s:default_config)
 
 function! llama#init()
-    let s:pos_x = 0
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
     let s:pos_y = 0
     let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case
 
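For reference, the options documented above can be overridden by defining g:llama_config before llama#init() is called; as the `get(g:, 'llama_config', s:default_config)` line shows, the defaults only apply when the variable is unset, so an override should provide all the keys the plugin reads. A minimal sketch (the keys mirror s:default_config; the values are illustrative, not recommendations):

    " example override in init.vim (illustrative values)
    let g:llama_config = {
        \ 'endpoint':         'http://127.0.0.1:8012/infill',
        \ 'n_prefix':         256,
        \ 'n_suffix':         256,
        \ 'n_predict':        64,
        \ 't_max_prompt_ms':  500,
        \ 't_max_predict_ms': 200,
        \ 'show_info':        v:true,
        \ 'auto_fim':         v:false,
        \ }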
@@ -46,8 +87,8 @@ function! llama#init()
 
     augroup llama
         autocmd!
         autocmd InsertEnter * inoremap <buffer> <silent> <C-F> <C-O>:call llama#fim(v:false)<CR>
-        autocmd InsertLeave * call llama#fim_cancel()
+        autocmd InsertLeavePre * call llama#fim_cancel()
 
         autocmd CursorMoved * call llama#fim_cancel()
     augroup END
@@ -90,7 +131,6 @@ function! llama#fim(is_auto) abort
        \ 'prompt': "",
        \ 'input_prefix': l:prefix,
        \ 'input_suffix': l:suffix,
-       "\ 'stop': g:llama_config.stop,
        \ 'n_predict': g:llama_config.n_predict,
        \ 'penalty_last_n': 0,
        \ 'top_k': 100,
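The body above is the JSON payload sent to the /infill endpoint configured in g:llama_config.endpoint. A rough sketch of how such a request can be issued from Vim script with curl and a job callback; the local variable names and exact curl flags here are illustrative, not the plugin's verbatim code:

    let l:request = json_encode({
        \ 'input_prefix': l:prefix,
        \ 'input_suffix': l:suffix,
        \ 'n_predict':    g:llama_config.n_predict
        \ })

    let l:curl_command = printf(
        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
        \ g:llama_config.endpoint, shellescape(l:request))

    " extra keys in the options dict (here: is_auto) are exposed to dict
    " callbacks as self.is_auto, which s:fim_on_stdout checks further below
    let s:current_job = jobstart(l:curl_command, {
        \ 'on_stdout': function('s:fim_on_stdout'),
        \ 'is_auto':   a:is_auto
        \ })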
@@ -126,16 +166,23 @@ function! llama#fim(is_auto) abort
     endif
 endfunction
 
-function! llama#fim_accept()
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
     " insert the suggestion at the cursor location
     if s:can_accept && len(s:content) > 0
         call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0])
         if len(s:content) > 1
-            call append(s:pos_y, s:content[1:-1])
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
         endif
 
         " move the cursor to the end of the accepted text
-        call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        if !a:first_line
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1)
+        endif
     endif
 
     call llama#fim_cancel()
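A concrete example of the new first_line flag: if s:content is ['foo(x)', '    return x', 'endfunction'], accepting with first_line == v:false inserts all three lines and moves the cursor past the end of the accepted text, while first_line == v:true appends only 'foo(x)' to the current line and leaves the cursor at its end.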
@@ -146,6 +193,11 @@ function! llama#fim_cancel()
         call jobstop(s:current_job)
     endif
 
+    if s:timer_fim != -1
+        call timer_stop(s:timer_fim)
+        let s:timer_fim = -1
+    endif
+
     " clear the virtual text
     let l:bufnr = bufnr('%')
 
@@ -155,7 +207,9 @@ function! llama#fim_cancel()
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)
 
+    " remove the mappings
     silent! iunmap <buffer> <Tab>
+    silent! iunmap <buffer> <S-Tab>
     silent! iunmap <buffer> <Esc>
 
     augroup llama_insert
@@ -173,6 +227,8 @@ function! s:fim_auto_enable()
     augroup END
 endfunction
 
+" auto-start a fim job a short time after the cursor has moved
+" if there is already a job queued - cancel it
 function! s:fim_auto()
     if s:current_job != v:null
         call jobstop(s:current_job)
@@ -189,7 +245,7 @@ function! s:fim_auto()
     let s:timer_fim = timer_start(500, {-> llama#fim(v:true)})
 endfunction
 
+" callback that processes the result from the server
 function! s:fim_on_stdout(job_id, data, event) dict
     let l:raw = join(a:data, "\n")
     if len(l:raw) == 0
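Together with the timer handling added to llama#fim_cancel() above, this amounts to a simple debounce: every cursor movement cancels the pending job and timer, and a new request only starts once the cursor has been idle for 500 ms. A self-contained sketch of the same pattern (the function and variable names here are illustrative, not the plugin's own):

    let s:timer = -1

    " restart the timer on every cursor move; the work only runs once the
    " cursor has been idle for the full delay
    function! s:on_cursor_moved()
        if s:timer != -1
            call timer_stop(s:timer)
            let s:timer = -1
        endif
        let s:timer = timer_start(500, {-> s:do_work()})
    endfunction

    function! s:do_work()
        let s:timer = -1
        " start the FIM request here
    endfunction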
@@ -199,6 +255,13 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let s:can_accept = v:true
     let l:has_info = v:false
 
+    if s:can_accept && v:shell_error
+        if !self.is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
     let l:n_prompt = 0
     let l:t_prompt_ms = 1.0
     let l:s_prompt = 0
@@ -207,13 +270,6 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:t_predict_ms = 1.0
     let l:s_predict = 0
 
-    if s:can_accept && v:shell_error
-        if !self.is_auto
-            call add(s:content, "<| curl error: is the server on? |>")
-        endif
-        let s:can_accept = v:false
-    endif
-
     " get the generated suggestion
     if s:can_accept
         let l:response = json_decode(l:raw)
@@ -227,7 +283,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             call remove(s:content, -1)
         endwhile
 
-        " if response.timings
+        " if response.timings is available
         if len(get(l:response, 'timings', {})) > 0
             let l:has_info = v:true
             let l:timings = get(l:response, 'timings', {})
@@ -264,8 +320,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')
 
-    " construct the info message:
-    if l:has_info
+    " construct the info message and display it to the right of the current line
+    if g:llama_config.show_info && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
 
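As a worked example of that offset, assuming s:line_cur_suffix holds the text to the right of the cursor: with a 20-character first suggestion line and a 5-character suffix, the info string is prefixed with 20 - 5 + 3 = 18 spaces, placing it 3 columns past the end of the rendered suggestion.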
@@ -282,6 +338,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             \ })
     endif
 
+    " display the suggestion
     call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
         \ 'virt_text_win_col': virtcol('.') - 1
@@ -293,8 +350,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
         \ })
 
     " setup accept/cancel events
-    inoremap <buffer> <Tab> <C-O>:call llama#fim_accept()<CR>
-    inoremap <buffer> <Esc> <C-O>:call llama#fim_cancel()<CR><Esc>
+    inoremap <buffer> <Tab> <C-O>:call llama#fim_accept(v:false)<CR>
+    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
 
     augroup llama_insert
         autocmd!
@@ -1791,11 +1791,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p)
     for (size_t i = 0; i < cur_p->size; ++i) {
         LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+#endif
 
-<<<<<<< HEAD
-=======
-    float p_max = 0.0f;
->>>>>>> af919ec1 (llama : simplify infill sampler)
     float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;
 
@@ -1807,20 +1804,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p)
         }
     }
 
-<<<<<<< HEAD
     const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
 
     LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
 
     if (3*p_eog_sum*cur_p->size > p_txt_sum) {
         LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
-=======
-    const float rat = p_txt_sum / p_eog_sum;
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
-
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
->>>>>>> af919ec1 (llama : simplify infill sampler)
 
     // keep just the EOG tokens
     const auto size_org = cur_p->size;
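Put differently, assuming p_txt_sum and p_eog_sum are the summed probabilities of the non-EOG and EOG candidates as their names suggest, the kept branch falls back to sampling EOG once p_eog_sum > p_txt_sum / (3 * cur_p->size); with 128 candidates that threshold is p_txt_sum / 384.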
@@ -1891,7 +1880,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p)
         }
     }
 
-<<<<<<< HEAD
     size_t n_non_eog = 0;
 
     size_t size_org = cur_p->size;
@@ -1908,12 +1896,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p)
 
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
-=======
-    // mask non-EOG tokens with prob < 0.2
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
-            cur_p->data[i].logit = -INFINITY;
->>>>>>> af919ec1 (llama : simplify infill sampler)
         }
 
         if (!is_eog) {