From 865d9bc48a903287649784e15b4a9d48934a9ace Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 11 Oct 2024 12:26:22 +0300
Subject: [PATCH] llama : clean-up

ggml-ci
---
 examples/llama.vim     | 111 +++++++++++++++++++++++++++++++----------
 src/llama-sampling.cpp |  20 +-------
 2 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index c89ddea65..99712d234 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -1,31 +1,72 @@
+" LLM-based text completion using llama.cpp
+"
+" requires:
+"
+"   - neovim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
 " sample config:
 "
-"   - Ctrl+F - trigger FIM completion manually
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the suggestion
+"   - Ctrl+F    - trigger FIM completion manually
 "
-" run this once to initialise the plugin:
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
 "
-"   :call llama#init()
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided context will be used during the inference
+"     lower values will use a smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
 "
 " color of the suggested text
 highlight llama_hl_hint guifg=#ff772f
 highlight llama_hl_info guifg=#77ff2f

+" endpoint:         llama.cpp server endpoint
+" n_prefix:         number of lines to include in the prefix
+" n_suffix:         number of lines to include in the suffix
+" n_predict:        max number of tokens to predict
+" t_max_prompt_ms:  max allotted time for the prompt processing
+" t_max_predict_ms: max allotted time for the text generation
+" show_info:        show extra info about the inference
+" auto_fim:         trigger FIM completion automatically on cursor movement
 let s:default_config = {
     \ 'endpoint':         'http://127.0.0.1:8012/infill',
-    \ 'n_prefix':         128,
-    \ 'n_suffix':         128,
+    \ 'n_prefix':         256,
+    \ 'n_suffix':         256,
     \ 'n_predict':        64,
-    \ 't_max_prompt_ms':  300,
+    \ 't_max_prompt_ms':  500,
     \ 't_max_predict_ms': 200,
+    \ 'show_info':        v:true,
     \ 'auto_fim':         v:true,
-    \ 'stop':             ["\n"]
     \ }

 let g:llama_config = get(g:, 'llama_config', s:default_config)

 function! llama#init()
-    let s:pos_x = 0
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
     let s:pos_y = 0

     let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case
@@ -46,8 +87,8 @@ function! llama#init()

     augroup llama
         autocmd!
-        autocmd InsertEnter * inoremap :call llama#fim(v:false)
-        autocmd InsertLeave * call llama#fim_cancel()
+        autocmd InsertEnter * inoremap :call llama#fim(v:false)
+        autocmd InsertLeavePre * call llama#fim_cancel()

         autocmd CursorMoved * call llama#fim_cancel()
     augroup END
@@ -90,7 +131,6 @@ function! llama#fim(is_auto) abort
         \ 'prompt':         "",
         \ 'input_prefix':   l:prefix,
         \ 'input_suffix':   l:suffix,
-       "\ 'stop':           g:llama_config.stop,
         \ 'n_predict':      g:llama_config.n_predict,
         \ 'penalty_last_n': 0,
         \ 'top_k':          100,
@@ -126,16 +166,23 @@ function! llama#fim(is_auto) abort
     endif
 endfunction

-function! llama#fim_accept()
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
     " insert the suggestion at the cursor location
     if s:can_accept && len(s:content) > 0
         call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0])
         if len(s:content) > 1
-            call append(s:pos_y, s:content[1:-1])
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
         endif

         " move the cursor to the end of the accepted text
-        call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        if !a:first_line
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1)
+        endif
     endif

     call llama#fim_cancel()
@@ -146,6 +193,11 @@ function! llama#fim_cancel()
         call jobstop(s:current_job)
     endif

+    if s:timer_fim != -1
+        call timer_stop(s:timer_fim)
+        let s:timer_fim = -1
+    endif
+
     " clear the virtual text
     let l:bufnr = bufnr('%')

@@ -155,7 +207,9 @@ function! llama#fim_cancel()
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
     call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1)

+    " remove the mappings
     silent! iunmap
+    silent! iunmap
     silent! iunmap

     augroup llama_insert
@@ -173,6 +227,8 @@ function! s:fim_auto_enable()
     augroup END
 endfunction

+" auto-start a fim job a short time after the cursor has moved
+" if there is already a job queued - cancel it
 function! s:fim_auto()
     if s:current_job != v:null
         call jobstop(s:current_job)
@@ -189,7 +245,7 @@ function! s:fim_auto()
     let s:timer_fim = timer_start(500, {-> llama#fim(v:true)})
 endfunction

-
+" callback that processes the result from the server
 function! s:fim_on_stdout(job_id, data, event) dict
     let l:raw = join(a:data, "\n")
     if len(l:raw) == 0
@@ -199,6 +255,13 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let s:can_accept = v:true
     let l:has_info   = v:false

+    if s:can_accept && v:shell_error
+        if !self.is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
     let l:n_prompt    = 0
     let l:t_prompt_ms = 1.0
     let l:s_prompt    = 0
@@ -207,13 +270,6 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:n_predict    = 0
     let l:t_predict_ms = 1.0
     let l:s_predict    = 0

-    if s:can_accept && v:shell_error
-        if !self.is_auto
-            call add(s:content, "<| curl error: is the server on? |>")
-        endif
-        let s:can_accept = v:false
-    endif
-
     " get the generated suggestion
     if s:can_accept
         let l:response = json_decode(l:raw)
@@ -227,7 +283,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             call remove(s:content, -1)
         endwhile

-        " if response.timings
+        " if response.timings is available
        if len(get(l:response, 'timings', {})) > 0
            let l:has_info = v:true
            let l:timings  = get(l:response, 'timings', {})
@@ -264,8 +320,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim  = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')

-    " construct the info message:
-    if l:has_info
+    " construct the info message and display it to the right of the current line
+    if g:llama_config.show_info && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
@@ -282,6 +338,7 @@ function! s:fim_on_stdout(job_id, data, event) dict
             \ })
     endif

+    " display the suggestion
     call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hl_hint']],
         \ 'virt_text_win_col': virtcol('.') - 1
@@ -293,8 +350,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
         \ })

     " setup accept/cancel events
-    inoremap :call llama#fim_accept()
-    inoremap :call llama#fim_cancel()
+    inoremap :call llama#fim_accept(v:false)
+    inoremap :call llama#fim_accept(v:true)

     augroup llama_insert
         autocmd!
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 4a5b922c4..96a979018 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1791,11 +1791,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     for (size_t i = 0; i < cur_p->size; ++i) {
         LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+#endif

-<<<<<<< HEAD
-=======
-    float p_max = 0.0f;
->>>>>>> af919ec1 (llama : simplify infill sampler)
     float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;

@@ -1807,20 +1804,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
        }
     }

-<<<<<<< HEAD
     const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);

     LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);

     if (3*p_eog_sum*cur_p->size > p_txt_sum) {
         LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
-=======
-    const float rat = p_txt_sum / p_eog_sum;
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
-
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
->>>>>>> af919ec1 (llama : simplify infill sampler)

         // keep just the EOG tokens
         const auto size_org = cur_p->size;
@@ -1891,7 +1880,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
        }
     }

-<<<<<<< HEAD
     size_t n_non_eog = 0;

     size_t size_org = cur_p->size;
@@ -1908,12 +1896,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_

        if (cur_p->data[i].p < thold && !is_eog) {
            continue;
-=======
-        // mask non-EOG tokens with prob < 0.2
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
-                cur_p->data[i].logit = -INFINITY;
->>>>>>> af919ec1 (llama : simplify infill sampler)
        }

        if (!is_eog) {
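
For reference, the infill sampler change above keeps the heuristic that compares the probability mass of the end-of-generation (EOG) candidates against the probability mass of the regular text candidates and, when the EOG mass is large enough, keeps only the EOG tokens. Below is a minimal standalone C++ sketch of that decision rule; the function name should_sample_eog and the example numbers are illustrative only and are not part of the llama.cpp API.

#include <cstddef>
#include <cstdio>

// Sketch of the EOG-vs-text decision used by the infill sampler above:
// it mirrors `if (3*p_eog_sum*cur_p->size > p_txt_sum)` from llama_sampler_infill_apply,
// i.e. prefer EOG once the summed EOG probability, scaled by the number of
// candidates, outweighs the summed probability of the text tokens.
static bool should_sample_eog(float p_txt_sum, float p_eog_sum, size_t n_candidates) {
    return 3.0f*p_eog_sum*n_candidates > p_txt_sum;
}

int main() {
    // 100 candidates, text mass 0.90, EOG mass 0.01 -> 3*0.01*100 = 3.0 > 0.90 -> prefer EOG (prints 1)
    std::printf("%d\n", should_sample_eog(0.90f, 0.01f,  100));
    // EOG mass 0.001 -> 3*0.001*100 = 0.3 <= 0.90 -> keep generating text (prints 0)
    std::printf("%d\n", should_sample_eog(0.90f, 0.001f, 100));
    return 0;
}

When the condition holds, the sampler in the patch then drops the non-EOG candidates, which is what the "keep just the EOG tokens" block in the hunk above does.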