From 865d9bc48a903287649784e15b4a9d48934a9ace Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 11 Oct 2024 12:26:22 +0300
Subject: [PATCH] llama : clean-up

ggml-ci
---
 examples/llama.vim     | 111 +++++++++++++++++++++++++++++++----------
 src/llama-sampling.cpp |  20 +-------
 2 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index c89ddea65..99712d234 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -1,31 +1,72 @@
+" LLM-based text completion using llama.cpp
+"
+" requires:
+"
+"   - neovim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
 " sample config:
 "
-"   - Ctrl+F - trigger FIM completion manually
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the suggestion
+"   - Ctrl+F    - trigger FIM completion manually
 "
-" run this once to initialise the plugin:
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
 "
-"   :call llama#init()
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided context will be used during the inference
+"     lower values will use a smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
 "
 " color of the suggested text
 highlight llama_hl_hint guifg=#ff772f
 highlight llama_hl_info guifg=#77ff2f

+" endpoint:         llama.cpp server endpoint
+" n_prefix:         number of lines to include in the prefix
+" n_suffix:         number of lines to include in the suffix
+" n_predict:        max number of tokens to predict
+" t_max_prompt_ms:  max allotted time for the prompt processing
+" t_max_predict_ms: max allotted time for the text generation
+" show_info:        show extra info about the inference
+" auto_fim:         trigger FIM completion automatically on cursor movement
 let s:default_config = {
     \ 'endpoint':         'http://127.0.0.1:8012/infill',
-    \ 'n_prefix':         128,
-    \ 'n_suffix':         128,
+    \ 'n_prefix':         256,
+    \ 'n_suffix':         256,
     \ 'n_predict':        64,
-    \ 't_max_prompt_ms':  300,
+    \ 't_max_prompt_ms':  500,
     \ 't_max_predict_ms': 200,
+    \ 'show_info':        v:true,
     \ 'auto_fim':         v:true,
-    \ 'stop':             ["\n"]
     \ }

 let g:llama_config = get(g:, 'llama_config', s:default_config)

 function! llama#init()
-    let s:pos_x = 0
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
     let s:pos_y = 0

     let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case
@@ -46,8 +87,8 @@ function! llama#init()

     augroup llama
         autocmd!
-        autocmd InsertEnter * inoremap :call llama#fim(v:false)
-        autocmd InsertLeave * call llama#fim_cancel()
+        autocmd InsertEnter * inoremap :call llama#fim(v:false)
+        autocmd InsertLeavePre * call llama#fim_cancel()

         autocmd CursorMoved * call llama#fim_cancel()
     augroup END
@@ -90,7 +131,6 @@ function! llama#fim(is_auto) abort
         \ 'prompt':         "",
         \ 'input_prefix':   l:prefix,
         \ 'input_suffix':   l:suffix,
-       "\ 'stop':           g:llama_config.stop,
         \ 'n_predict':      g:llama_config.n_predict,
         \ 'penalty_last_n': 0,
         \ 'top_k':          100,
@@ -126,16 +166,23 @@ function! llama#fim(is_auto) abort
     endif
 endfunction

-function! llama#fim_accept()
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
     " insert the suggestion at the cursor location
     if s:can_accept && len(s:content) > 0
         call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0])
         if len(s:content) > 1
-            call append(s:pos_y, s:content[1:-1])
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
         endif

         " move the cursor to the end of the accepted text
-        call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        if !a:first_line
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1)
+        endif
     endif

     call llama#fim_cancel()
@@ -146,6 +193,11 @@ function! llama#fim_cancel()
         call jobstop(s:current_job)
     endif

+    if s:timer_fim != -1
+        call timer_stop(s:timer_fim)
+        let s:timer_fim = -1
+    endif
+
     " clear the virtual text
     let l:bufnr = bufnr('%')

@@ -155,7 +207,9 @@ function! llama#fim_cancel()
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)

+    " remove the mappings
     silent! iunmap
+    silent! iunmap
     silent! iunmap

     augroup llama_insert
@@ -173,6 +227,8 @@ function! s:fim_auto_enable()
     augroup END
 endfunction

+" auto-start a fim job a short time after the cursor has moved
+" if there is already a job queued - cancel it
 function! s:fim_auto()
     if s:current_job != v:null
         call jobstop(s:current_job)
@@ -189,7 +245,7 @@ function! s:fim_auto()
     let s:timer_fim = timer_start(500, {-> llama#fim(v:true)})
 endfunction

-
+" callback that processes the result from the server
 function! s:fim_on_stdout(job_id, data, event) dict
     let l:raw = join(a:data, "\n")
     if len(l:raw) == 0
@@ -199,6 +255,13 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let s:can_accept = v:true
     let l:has_info   = v:false

+    if s:can_accept && v:shell_error
+        if !self.is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
     let l:n_prompt    = 0
     let l:t_prompt_ms = 1.0
     let l:s_prompt    = 0
@@ -207,13 +270,6 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:n_predict    = 0
     let l:t_predict_ms = 1.0
     let l:s_predict    = 0

-    if s:can_accept && v:shell_error
-        if !self.is_auto
-            call add(s:content, "<| curl error: is the server on? |>")
-        endif
-        let s:can_accept = v:false
-    endif
-
     " get the generated suggestion
     if s:can_accept
         let l:response = json_decode(l:raw)
@@ -227,7 +283,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             call remove(s:content, -1)
         endwhile

-        " if response.timings
+        " if response.timings is available
        if len(get(l:response, 'timings', {})) > 0
            let l:has_info = v:true
            let l:timings  = get(l:response, 'timings', {})
@@ -264,8 +320,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim  = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')

-    " construct the info message:
-    if l:has_info
+    " construct the info message and display it to the right of the current line
+    if g:llama_config.show_info && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
@@ -282,6 +338,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             \ })
     endif

+    " display the suggestion
     call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
         \ 'virt_text_win_col': virtcol('.') - 1
@@ -293,8 +350,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
         \ })

     " setup accept/cancel events
-    inoremap :call llama#fim_accept()
-    inoremap :call llama#fim_cancel()
+    inoremap :call llama#fim_accept(v:false)
+    inoremap :call llama#fim_accept(v:true)

     augroup llama_insert
         autocmd!
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 4a5b922c4..96a979018 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1791,11 +1791,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     for (size_t i = 0; i < cur_p->size; ++i) {
         LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+#endif

-<<<<<<< HEAD
-=======
-    float p_max = 0.0f;
->>>>>>> af919ec1 (llama : simplify infill sampler)
     float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;

@@ -1807,20 +1804,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
        }
     }

-<<<<<<< HEAD
     const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);

     LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);

     if (3*p_eog_sum*cur_p->size > p_txt_sum) {
         LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
-=======
-    const float rat = p_txt_sum / p_eog_sum;
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
-
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
->>>>>>> af919ec1 (llama : simplify infill sampler)

         // keep just the EOG tokens
         const auto size_org = cur_p->size;
@@ -1891,7 +1880,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
        }
     }

-<<<<<<< HEAD
     size_t n_non_eog = 0;

     size_t size_org = cur_p->size;
@@ -1908,12 +1896,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_

        if (cur_p->data[i].p < thold && !is_eog) {
            continue;
-=======
-        // mask non-EOG tokens with prob < 0.2
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
-                cur_p->data[i].logit = -INFINITY;
->>>>>>> af919ec1 (llama : simplify infill sampler)
        }

        if (!is_eog) {
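
For reference, the infill sampler change above keeps the heuristic that compares the probability mass of the end-of-generation (EOG) candidates against the probability mass of the regular text candidates and, when the EOG mass is large enough, keeps only the EOG tokens. Below is a minimal standalone C++ sketch of that decision rule; the function name should_sample_eog and the example numbers are illustrative only and are not part of the llama.cpp API.

#include <cstddef>
#include <cstdio>

// Sketch of the EOG-vs-text decision used by the infill sampler above:
// it mirrors `if (3*p_eog_sum*cur_p->size > p_txt_sum)` from llama_sampler_infill_apply,
// i.e. prefer EOG once the summed EOG probability, scaled by the number of
// candidates, outweighs the summed probability of the text tokens.
static bool should_sample_eog(float p_txt_sum, float p_eog_sum, size_t n_candidates) {
    return 3.0f*p_eog_sum*n_candidates > p_txt_sum;
}

int main() {
    // 100 candidates, text mass 0.90, EOG mass 0.01 -> 3*0.01*100 = 3.0 > 0.90 -> prefer EOG (prints 1)
    std::printf("%d\n", should_sample_eog(0.90f, 0.01f,  100));
    // EOG mass 0.001 -> 3*0.001*100 = 0.3 <= 0.90 -> keep generating text (prints 0)
    std::printf("%d\n", should_sample_eog(0.90f, 0.001f, 100));
    return 0;
}

When the condition holds, the sampler in the patch then drops the non-EOG candidates, which is what the "keep just the EOG tokens" block in the hunk above does.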