Updated docs; the extra KV space is now subtracted from the user's context value at generation time instead of being added to the context size at model load.
This commit is contained in:
parent
66ef4a20e2
commit
a012342a77
2 changed files with 26 additions and 4 deletions
|
@ -98,6 +98,8 @@ static std::mutex concat_output_mtx;
|
||||||
static std::string concat_output = "";
|
static std::string concat_output = "";
|
||||||
static std::string concat_output_reader_copy = "";
|
static std::string concat_output_reader_copy = "";
|
||||||
|
|
||||||
|
const size_t extra_context_handle_fragmentation = 80;
|
||||||
|
|
||||||
inline bool IsNanCheck(float f)
|
inline bool IsNanCheck(float f)
|
||||||
{
|
{
|
||||||
const unsigned int u = *(unsigned int*)&f;
|
const unsigned int u = *(unsigned int*)&f;
|
||||||
|
@ -883,7 +885,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
{
|
{
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
llama_context_params llama_ctx_params = llama_context_default_params();
|
llama_context_params llama_ctx_params = llama_context_default_params();
|
||||||
llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
|
llama_ctx_params.n_ctx = clamped_max_context_length;
|
||||||
//llama_ctx_paran_parts = -1;
|
//llama_ctx_paran_parts = -1;
|
||||||
llama_ctx_params.seed = -1;
|
llama_ctx_params.seed = -1;
|
||||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||||
|
@ -1421,6 +1423,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
stop_sequence.push_back(stopper);
|
stop_sequence.push_back(stopper);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string addedmemory = inputs.memory;
|
std::string addedmemory = inputs.memory;
|
||||||
params.prompt = inputs.prompt;
|
params.prompt = inputs.prompt;
|
||||||
params.seed = inputs.seed;
|
params.seed = inputs.seed;
|
||||||
|
@ -1442,6 +1445,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
params.n_threads_batch = n_blasthreads;
|
params.n_threads_batch = n_blasthreads;
|
||||||
bool stream_sse = inputs.stream_sse;
|
bool stream_sse = inputs.stream_sse;
|
||||||
|
|
||||||
|
if(params.n_ctx >= 256 && useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
|
||||||
|
{
|
||||||
|
params.n_ctx -= extra_context_handle_fragmentation; //hold back part of the context as a buffer to handle KV fragmentation
|
||||||
|
if(debugmode==1)
|
||||||
|
{
|
||||||
|
printf("\nTrue max context permitted: %d\n",params.n_ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
|
bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
|
||||||
|
|
||||||
generation_finished = false; // Set current generation status
|
generation_finished = false; // Set current generation status
|
||||||
|
|
|
@ -367,9 +367,19 @@
|
||||||
"content": {
|
"content": {
|
||||||
"application/json": {
|
"application/json": {
|
||||||
"example": {
|
"example": {
|
||||||
|
"max_context_length": 2048,
|
||||||
|
"max_length": 100,
|
||||||
"prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
|
"prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
|
||||||
|
"quiet": false,
|
||||||
|
"rep_pen": 1.1,
|
||||||
|
"rep_pen_range": 256,
|
||||||
|
"rep_pen_slope": 1,
|
||||||
"temperature": 0.5,
|
"temperature": 0.5,
|
||||||
"top_p": 0.9
|
"tfs": 1.0,
|
||||||
|
"top_a": 0,
|
||||||
|
"top_k": 100,
|
||||||
|
"top_p": 0.9,
|
||||||
|
"typical": 1.0
|
||||||
},
|
},
|
||||||
"schema": {
|
"schema": {
|
||||||
"$ref": "#/components/schemas/GenerationInput"
|
"$ref": "#/components/schemas/GenerationInput"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue