updated docs; the extra KV space is now subtracted from the user's ctx value instead of being added on load.

Concedo 2023-11-30 14:19:40 +08:00
parent 66ef4a20e2
commit a012342a77
2 changed files with 26 additions and 4 deletions


@@ -98,6 +98,8 @@ static std::mutex concat_output_mtx;
 static std::string concat_output = "";
 static std::string concat_output_reader_copy = "";
 
+const size_t extra_context_handle_fragmentation = 80;
+
 inline bool IsNanCheck(float f)
 {
     const unsigned int u = *(unsigned int*)&f;
@@ -883,7 +885,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
-        llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
+        llama_ctx_params.n_ctx = clamped_max_context_length;
         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
         llama_ctx_params.f16_kv = inputs.f16_kv;
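
For reference, a minimal standalone sketch of the load-time change above. Only the names clamped_max_context_length and the +64 / exact-allocation logic come from the diff; the rest is hypothetical scaffolding, not koboldcpp code. The old code padded the allocation by 64 tokens on load; the new code allocates exactly what the user requested.

#include <cstdio>

// Old behavior: pad the context allocation on load.
static unsigned int old_load_ctx(unsigned int clamped_max_context_length) {
    return clamped_max_context_length + 64;
}

// New behavior: allocate exactly what the user asked for.
static unsigned int new_load_ctx(unsigned int clamped_max_context_length) {
    return clamped_max_context_length;
}

int main() {
    unsigned int user_ctx = 4096;
    printf("old n_ctx: %u\n", old_load_ctx(user_ctx)); // prints 4160
    printf("new n_ctx: %u\n", new_load_ctx(user_ctx)); // prints 4096
    return 0;
}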
@@ -1421,6 +1423,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             stop_sequence.push_back(stopper);
         }
     }
+
     std::string addedmemory = inputs.memory;
     params.prompt = inputs.prompt;
     params.seed = inputs.seed;
@@ -1442,6 +1445,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.n_threads_batch = n_blasthreads;
     bool stream_sse = inputs.stream_sse;
 
+    if(params.n_ctx >= 256 && useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+    {
+        params.n_ctx -= extra_context_handle_fragmentation; //add some additional buffer to handle KV fragmentation
+        if(debugmode==1)
+        {
+            printf("\nTrue max context permitted: %d\n",params.n_ctx);
+        }
+    }
+
     bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
     generation_finished = false; // Set current generation status
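
The fragmentation reserve now comes out of the user's budget at generation time. A sketch of the resulting arithmetic, assuming context shifting is active and the GGUF file-format check passes (both stubbed to a single bool here; only extra_context_handle_fragmentation and the n_ctx >= 256 guard come from the diff):

#include <cstdio>
#include <cstddef>

const size_t extra_context_handle_fragmentation = 80; // same constant as in the diff

// Usable context after reserving the fragmentation buffer; the
// useContextShift / file_format conditions are collapsed into one bool.
static size_t true_max_context(size_t n_ctx, bool use_context_shift) {
    if (n_ctx >= 256 && use_context_shift) {
        n_ctx -= extra_context_handle_fragmentation;
    }
    return n_ctx;
}

int main() {
    // A user loading with a 4096-token context keeps a 4096-token KV
    // allocation but generates against 4096 - 80 = 4016 usable tokens.
    printf("True max context permitted: %zu\n", true_max_context(4096, true));
    return 0;
}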


@@ -367,9 +367,19 @@
         "content": {
           "application/json": {
             "example": {
+              "max_context_length": 2048,
+              "max_length": 100,
               "prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
+              "quiet": false,
+              "rep_pen": 1.1,
+              "rep_pen_range": 256,
+              "rep_pen_slope": 1,
               "temperature": 0.5,
-              "top_p": 0.9
+              "tfs": 1.0,
+              "top_a": 0,
+              "top_k": 100,
+              "top_p": 0.9,
+              "typical": 1.0
             },
             "schema": {
               "$ref": "#/components/schemas/GenerationInput"