Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile
This commit is contained in:
commit
926d90fbab
7 changed files with 218 additions and 11 deletions
|
@ -194,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top-p") {
|
||||
|
@ -564,8 +570,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
|
||||
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
|
||||
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
|
|
132
examples/llama.vim
Normal file
132
examples/llama.vim
Normal file
|
@ -0,0 +1,132 @@
|
|||
" Requires an already running llama.cpp server
|
||||
" To install either copy or symlink to ~/.vim/autoload/llama.vim
|
||||
" Then start with either :call llama#doLlamaGen(),
|
||||
" or add a keybind to your vimrc such as
|
||||
" nnoremap Z :call llama#doLlamaGen()<CR>
|
||||
" Similarly, you could add an insert mode keybind with
|
||||
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
|
||||
"
|
||||
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
|
||||
" let g:llama_api_url = "192.168.1.10:8080"
|
||||
" llama_overrides can also be set through buffer/window scopes. For instance
|
||||
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
|
||||
" Could be added to your .vimrc to automatically set a lower temperature when
|
||||
" editing a python script
|
||||
" Additionally, an override dict can be stored at the top of a file
|
||||
" !*{"stop": ["User:"]}
|
||||
" Could be added to the start of your chatlog.txt to set the stopping token
|
||||
" These parameter dicts are merged together from lowest to highest priority:
|
||||
" server default -> g:llama_overrides -> w:llama_overrides ->
|
||||
" b:llama_overrides -> in file (!*) overrides
|
||||
"
|
||||
" Sublists (like logit_bias and stop) are overridden, not merged
|
||||
" Example override:
|
||||
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
|
||||
if !exists("g:llama_api_url")
|
||||
let g:llama_api_url= "127.0.0.1:8080"
|
||||
endif
|
||||
if !exists("g:llama_overrides")
|
||||
let g:llama_overrides = {}
|
||||
endif
|
||||
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
|
||||
const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
|
||||
let s:linedict = {}
|
||||
|
||||
func s:callbackHandler(bufn, channel, msg)
|
||||
if len(a:msg) < 3
|
||||
return
|
||||
elseif a:msg[0] == "d"
|
||||
let l:msg = a:msg[6:-1]
|
||||
else
|
||||
let l:msg = a:msg
|
||||
endif
|
||||
let l:decoded_msg = json_decode(l:msg)
|
||||
let l:newtext = split(l:decoded_msg['content'], "\n", 1)
|
||||
if len(l:newtext) > 0
|
||||
call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
|
||||
else
|
||||
echo "nothing genned"
|
||||
endif
|
||||
if len(newtext) > 1
|
||||
let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
|
||||
let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
|
||||
endif
|
||||
if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
|
||||
echo "Finished generation"
|
||||
endif
|
||||
endfunction
|
||||
|
||||
func llama#doLlamaGen()
|
||||
if exists("b:job")
|
||||
if job_status(b:job) == "run"
|
||||
call job_stop(b:job)
|
||||
return
|
||||
endif
|
||||
endif
|
||||
|
||||
let l:cbuffer = bufnr("%")
|
||||
let s:linedict[l:cbuffer] = line('$')
|
||||
let l:buflines = getbufline(l:cbuffer, 1, 1000)
|
||||
let l:querydata = copy(s:querydata)
|
||||
call extend(l:querydata, g:llama_overrides)
|
||||
if exists("w:llama_overrides")
|
||||
call extend(l:querydata, w:llama_overrides)
|
||||
endif
|
||||
if exists("b:llama_overrides")
|
||||
call extend(l:querydata, b:llama_overrides)
|
||||
endif
|
||||
if l:buflines[0][0:1] == '!*'
|
||||
let l:userdata = json_decode(l:buflines[0][2:-1])
|
||||
call extend(l:querydata, l:userdata)
|
||||
let l:buflines = l:buflines[1:-1]
|
||||
endif
|
||||
let l:querydata.prompt = join(l:buflines, "\n")
|
||||
let l:curlcommand = copy(s:curlcommand)
|
||||
let l:curlcommand[2] = json_encode(l:querydata)
|
||||
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
|
||||
endfunction
|
||||
|
||||
" Echos the tokkenization of the provided string , or cursor to end of word
|
||||
" Onus is placed on the user to include the preceding space
|
||||
func llama#tokenizeWord(...)
|
||||
if (a:0 > 0)
|
||||
let l:input = a:1
|
||||
else
|
||||
exe "normal \"*ye"
|
||||
let l:input = @*
|
||||
endif
|
||||
let l:querydata = {"content": l:input}
|
||||
let l:curlcommand = copy(s:curlcommand)
|
||||
let l:curlcommand[2] = json_encode(l:querydata)
|
||||
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
|
||||
let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
|
||||
endfunction
|
||||
|
||||
func s:tokenizeWordCallback(plaintext, channel, msg)
|
||||
echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
|
||||
endfunction
|
||||
|
||||
|
||||
" Echos the token count of the entire buffer (or provided string)
|
||||
" Example usage :echo llama#tokenCount()
|
||||
func llama#tokenCount(...)
|
||||
if (a:0 > 0)
|
||||
let l:buflines = a:1
|
||||
else
|
||||
let l:buflines = getline(1,1000)
|
||||
if l:buflines[0][0:1] == '!*'
|
||||
let l:buflines = l:buflines[1:-1]
|
||||
endif
|
||||
let l:buflines = join(l:buflines, "\n")
|
||||
endif
|
||||
let l:querydata = {"content": l:buflines}
|
||||
let l:curlcommand = copy(s:curlcommand)
|
||||
let l:curlcommand[2] = json_encode(l:querydata)
|
||||
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
|
||||
let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
|
||||
endfunction
|
||||
|
||||
func s:tokenCountCallback(channel, msg)
|
||||
let resp = json_decode(a:msg)
|
||||
echo len(resp.tokens)
|
||||
endfunction
|
|
@ -1,3 +1,5 @@
|
|||
" Basic plugin example
|
||||
|
||||
function! Llm()
|
||||
|
||||
let url = "http://127.0.0.1:8080/completion"
|
||||
|
@ -16,8 +18,10 @@ function! Llm()
|
|||
" Extract the content field from the response
|
||||
let content = json_decode(response).content
|
||||
|
||||
let split_newlines = split(content, '\n', 1)
|
||||
|
||||
" Insert the content at the cursor position
|
||||
call setline(line('.'), getline('.') . content)
|
||||
call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
|
||||
endfunction
|
||||
|
||||
command! Llm call Llm()
|
||||
|
|
|
@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
|
|||
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
|
||||
|
||||
### Extended Context Size
|
||||
|
||||
Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
||||
|
||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||
|
||||
### Keep Prompt
|
||||
|
||||
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
|
||||
|
|
|
@ -151,6 +151,8 @@ node .
|
|||
|
||||
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
|
||||
|
||||
`grammar`: Set grammar for grammar-based sampling (default: no grammar)
|
||||
|
||||
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
|
||||
|
||||
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#ifndef NDEBUG
|
||||
// crash the server in debug mode, otherwise send an http 500 error
|
||||
|
@ -195,6 +196,8 @@ struct llama_server_context
|
|||
llama_context *ctx = nullptr;
|
||||
gpt_params params;
|
||||
|
||||
llama_grammar *grammar = nullptr;
|
||||
|
||||
bool truncated = false;
|
||||
bool stopped_eos = false;
|
||||
bool stopped_word = false;
|
||||
|
@ -226,6 +229,7 @@ struct llama_server_context
|
|||
void rewind()
|
||||
{
|
||||
params.antiprompt.clear();
|
||||
params.grammar.clear();
|
||||
num_prompt_tokens = 0;
|
||||
num_tokens_predicted = 0;
|
||||
generated_text = "";
|
||||
|
@ -237,6 +241,7 @@ struct llama_server_context
|
|||
stopped_limit = false;
|
||||
stopping_word = "";
|
||||
multibyte_pending = 0;
|
||||
grammar = nullptr;
|
||||
|
||||
n_remain = 0;
|
||||
n_past = 0;
|
||||
|
@ -257,6 +262,33 @@ struct llama_server_context
|
|||
return true;
|
||||
}
|
||||
|
||||
bool loadGrammar()
|
||||
{
|
||||
if (!params.grammar.empty()) {
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
|
||||
return false;
|
||||
}
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
|
||||
{
|
||||
auto it = params.logit_bias.find(llama_token_eos());
|
||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void loadPrompt()
|
||||
{
|
||||
params.prompt.insert(0, 1, ' '); // always add a first space
|
||||
|
@ -420,6 +452,10 @@ struct llama_server_context
|
|||
logits[llama_token_nl()] = nl_logit;
|
||||
}
|
||||
|
||||
if (grammar != nullptr) {
|
||||
llama_sample_grammar(ctx, &candidates_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0)
|
||||
{
|
||||
// Greedy sampling
|
||||
|
@ -457,10 +493,15 @@ struct llama_server_context
|
|||
}
|
||||
}
|
||||
|
||||
if (grammar != nullptr) {
|
||||
llama_grammar_accept_token(ctx, grammar, result.tok);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
|
||||
{
|
||||
result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
|
||||
}
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(result.tok);
|
||||
num_tokens_predicted++;
|
||||
|
@ -947,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama)
|
|||
{"stream", llama.stream},
|
||||
{"logit_bias", llama.params.logit_bias},
|
||||
{"n_probs", llama.params.n_probs},
|
||||
{"grammar", llama.params.grammar},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -1048,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|||
llama.params.n_keep = body.value("n_keep", default_params.n_keep);
|
||||
llama.params.seed = body.value("seed", default_params.seed);
|
||||
llama.params.prompt = body.value("prompt", default_params.prompt);
|
||||
llama.params.grammar = body.value("grammar", default_params.grammar);
|
||||
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
|
||||
|
||||
llama.params.logit_bias.clear();
|
||||
|
@ -1179,6 +1222,12 @@ int main(int argc, char **argv)
|
|||
|
||||
parse_options_completion(json::parse(req.body), llama);
|
||||
|
||||
if (!llama.loadGrammar())
|
||||
{
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
|
||||
llama.loadPrompt();
|
||||
llama.beginCompletion();
|
||||
|
||||
|
@ -1334,8 +1383,12 @@ int main(int argc, char **argv)
|
|||
|
||||
svr.set_error_handler([](const Request &, Response &res)
|
||||
{
|
||||
res.set_content("File Not Found", "text/plain");
|
||||
res.status = 404; });
|
||||
if (res.status == 400) {
|
||||
res.set_content("Invalid request", "text/plain");
|
||||
} else {
|
||||
res.set_content("File Not Found", "text/plain");
|
||||
res.status = 404;
|
||||
} });
|
||||
|
||||
// set timeouts and change hostname and port
|
||||
svr.set_read_timeout(sparams.read_timeout);
|
||||
|
@ -1363,6 +1416,9 @@ int main(int argc, char **argv)
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (llama.grammar != nullptr) {
|
||||
llama_grammar_free(llama.grammar);
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
|
|
12
llama.cpp
12
llama.cpp
|
@ -150,7 +150,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|||
}
|
||||
|
||||
// amount of VRAM needed per batch size to hold temporary results
|
||||
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
||||
// the values for 3b are not derived from testing but instead chosen conservatively
|
||||
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
|
@ -158,14 +158,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|||
{ MODEL_7B, 512ull * kB },
|
||||
{ MODEL_13B, 640ull * kB },
|
||||
{ MODEL_30B, 768ull * kB },
|
||||
{ MODEL_65B, 1536ull * kB },
|
||||
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
||||
{ MODEL_65B, 1360ull * kB },
|
||||
{ MODEL_70B, 1360ull * kB },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// amount of VRAM needed per batch size and context to hold temporary results
|
||||
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
||||
// the values for 3b are not derived from testing but instead chosen conservatively
|
||||
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
|
@ -173,8 +173,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|||
{ MODEL_7B, 128ull },
|
||||
{ MODEL_13B, 160ull },
|
||||
{ MODEL_30B, 208ull },
|
||||
{ MODEL_65B, 416ull },
|
||||
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
||||
{ MODEL_65B, 320ull },
|
||||
{ MODEL_70B, 320ull },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue