fixed a few OOM errors with larger contexts - I cannot figure out why they happen, so I am forced to increase the buffer size.
parent f53238f570
commit 69b85f5b61
5 changed files with 25 additions and 27 deletions
@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_threads = params.n_threads = inputs.threads;
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
+    params.memory_f16 = inputs.f16_kv;
+    params.n_ctx = inputs.max_context_length;
+    model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;

     if (file_format == FileFormat::GPT2_1)
     {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.temp = inputs.temperature;
     params.repeat_last_n = inputs.rep_pen_range;
     params.repeat_penalty = inputs.rep_pen;
+    params.n_ctx = inputs.max_context_length;
     params.n_batch = n_batch;
     params.n_threads = n_threads;

@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);

     //truncate to front of the prompt if its too long
-    int32_t nctx = 512;
-    if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-    {
-        nctx = model_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPTJ_3)
-    {
-        nctx = model_v2.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_1)
-    {
-        nctx = model_gpt2_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_2)
-    {
-        nctx = model_gpt2_v2.hparams.n_ctx;
-    }
+    int32_t nctx = params.n_ctx;

     if (embd_inp.size() + params.n_predict > nctx)
     {
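The hunk above now takes nctx straight from params.n_ctx instead of re-deriving it per file format. The body of the truncation branch is not part of this diff; a minimal sketch of what "truncate to front of the prompt" typically means (variable names reused from the hunk, everything else assumed, not this repository's code):

    // Hedged sketch: drop the oldest tokens so that the prompt plus the
    // requested n_predict tokens still fit inside the context window.
    if (embd_inp.size() + params.n_predict > nctx)
    {
        int keep = nctx - params.n_predict;   // room left for the prompt
        if (keep < 0) keep = 0;
        embd_inp.erase(embd_inp.begin(), embd_inp.end() - keep);
    }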
@@ -349,7 +349,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
     print("Load Model OK: " + str(loadok))

     if not loadok:
@@ -378,7 +378,7 @@ def main(args):
     RunServerMultiThreaded(args.host, args.port, embedded_kailite)

 if __name__ == '__main__':
-    print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+    print("Welcome to KoboldCpp - Version 1.4") # just update version manually
     parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
     parser.add_argument("model_file", help="Model file to load", nargs="?")
     portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-    parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
     args = parser.parse_args()
     main(args)
@@ -36,10 +36,13 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         }
     }

+    auto desiredMaxCtx = model.hparams.n_ctx;
+
     // load hparams
     {
         auto & hparams = model.hparams;

+
         fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
@@ -47,6 +50,9 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.f16, sizeof(hparams.f16));

+        //used to expand KV size if needed
+        desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model

         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;
         const int n_vocab = hparams.n_vocab;

         ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model

         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;

         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
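Why desiredMaxCtx is routed into n_ctx here: the KV cache in this loader is sized from n_ctx (n_mem = n_layer*n_ctx and n_elements = n_embd*n_mem), so a requested context larger than the value stored in the model file needs a proportionally larger allocation. A self-contained sketch of that arithmetic, with a helper name and example dimensions of my own choosing (not from the repository):

    #include <cstddef>
    #include <cstdio>

    // Bytes needed for the K and V caches combined: one elem_size-byte value
    // per (layer, context position, embedding dimension), for both K and V.
    static size_t kv_cache_bytes(int n_layer, int n_ctx, int n_embd, size_t elem_size)
    {
        const size_t n_mem      = (size_t)n_layer * n_ctx;
        const size_t n_elements = (size_t)n_embd * n_mem;
        return 2 * n_elements * elem_size;
    }

    int main()
    {
        // Example: 24 layers, 1024-dim embeddings, f32 (4-byte) elements.
        printf("1024 ctx: %zu MiB\n", kv_cache_bytes(24, 1024, 1024, 4) >> 20); // 192 MiB
        printf("2048 ctx: %zu MiB\n", kv_cache_bytes(24, 2048, 1024, 4) >> 20); // 384 MiB
        // The GGML_TYPE_F16 change in the hunks below halves elem_size to 2,
        // roughly halving these figures.
        return 0;
    }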
@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         }
     }

+    auto memory_type = GGML_TYPE_F16;
+
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
@@ -242,9 +244,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g

         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;

-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);

         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

@@ -370,7 +372,8 @@ bool gpt2_eval(
     const int n_head = hparams.n_head;
     const int n_vocab = hparams.n_vocab;

-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
@@ -378,7 +378,8 @@ bool gptj_eval(

     const int d_key = n_embd/n_head;

-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
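These last two hunks are the workaround named in the commit message: the evaluation scratch buffer is a fixed static allocation, so its cap is simply raised from 256 MiB to 1 GiB. The diff shows only the first line of the mem_per_token*N guard, not what happens inside it; as a hedged sketch of the general pattern used in ggml-based eval loops (not this repository's exact code), a buffer that grows to the estimated requirement instead of relying on a fixed cap looks roughly like this:

    #include <cstdlib>

    // Hedged sketch: grow the scratch buffer when the estimated requirement
    // (mem_per_token * N, plus ~10% headroom) exceeds the current size.
    static size_t buf_size = 256u*1024*1024;
    static void * buf = std::malloc(buf_size);

    static bool ensure_scratch(size_t mem_per_token, size_t N)
    {
        if (mem_per_token > 0 && mem_per_token*N > buf_size) {
            const size_t needed = (size_t)(1.1 * mem_per_token * N);
            void * grown = std::realloc(buf, needed);
            if (grown == nullptr) {
                return false; // allocation failed; caller should abort this eval
            }
            buf = grown;
            buf_size = needed;
        }
        return true;
    }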