fixed a few OOM errors with larger contexts - I cannot figure out why they happen, so I am forced to increase the buffer size.

Concedo 2023-04-11 00:14:57 +08:00
parent f53238f570
commit 69b85f5b61
5 changed files with 25 additions and 27 deletions


@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_threads = params.n_threads = inputs.threads;
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
+    params.memory_f16 = inputs.f16_kv;
+    params.n_ctx = inputs.max_context_length;
+    model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
     if (file_format == FileFormat::GPT2_1)
     {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.temp = inputs.temperature;
     params.repeat_last_n = inputs.rep_pen_range;
     params.repeat_penalty = inputs.rep_pen;
+    params.n_ctx = inputs.max_context_length;
     params.n_batch = n_batch;
     params.n_threads = n_threads;
@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
     //truncate to front of the prompt if its too long
-    int32_t nctx = 512;
-    if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-    {
-        nctx = model_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPTJ_3)
-    {
-        nctx = model_v2.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_1)
-    {
-        nctx = model_gpt2_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_2)
-    {
-        nctx = model_gpt2_v2.hparams.n_ctx;
-    }
+    int32_t nctx = params.n_ctx;
     if (embd_inp.size() + params.n_predict > nctx)
     {
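
With params.n_ctx now carrying the (possibly overridden) context length for every file format, the per-format lookup above is no longer needed. The truncation that follows keeps the tail of the prompt, so the most recent tokens survive when the prompt plus the requested prediction budget exceeds nctx. A minimal sketch of that behavior, assuming plain int32_t tokens in place of gpt_vocab::id and a hypothetical helper name:

    #include <cstdint>
    #include <vector>

    // Keep at most (nctx - n_predict) tokens, dropping from the FRONT of the
    // prompt so the newest context is what the model actually sees.
    static std::vector<int32_t> truncate_prompt(std::vector<int32_t> embd_inp,
                                                int32_t nctx, int32_t n_predict)
    {
        const int32_t budget = nctx - n_predict;
        if (budget > 0 && (int32_t)embd_inp.size() > budget) {
            embd_inp.erase(embd_inp.begin(), embd_inp.end() - budget);
        }
        return embd_inp;
    }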


@@ -349,7 +349,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
     print("Load Model OK: " + str(loadok))
     if not loadok:
@@ -378,7 +378,7 @@ def main(args):
     RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 if __name__ == '__main__':
-    print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+    print("Welcome to KoboldCpp - Version 1.4") # just update version manually
     parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
     parser.add_argument("model_file", help="Model file to load", nargs="?")
     portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-    parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
     args = parser.parse_args()
     main(args)


@@ -36,10 +36,13 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         }
     }
+    auto desiredMaxCtx = model.hparams.n_ctx;
     // load hparams
     {
         auto & hparams = model.hparams;
         fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
@@ -47,6 +50,9 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        //used to expand KV size if needed
+        desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;
         const int n_vocab = hparams.n_vocab;
         ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;
         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
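
desiredMaxCtx connects this loader to the gpttype_load_model change above: the caller stashes its requested context length in model.hparams.n_ctx before the file is opened, the header read then overwrites hparams.n_ctx with the value stored in the GGML file, and the larger of the two is what actually sizes the KV cache (n_mem = n_layer*n_ctx). A rough illustration of what that buys, using made-up GPT-2-medium-like dimensions and plain arithmetic instead of ggml's allocators:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // KV cache footprint for a given context length: one K and one V value per
    // layer, per context slot, per embedding dimension, stored as 32-bit floats
    // in this legacy GPT-2 path.
    static size_t kv_cache_bytes(int n_layer, int n_ctx, int n_embd)
    {
        const size_t n_mem      = (size_t)n_layer * n_ctx;  // slots
        const size_t n_elements = (size_t)n_embd * n_mem;   // per K or V tensor
        return 2 * n_elements * sizeof(float);              // K and V
    }

    int main()
    {
        const int n_layer = 24, n_embd = 1024;        // illustrative model sizes
        const int file_ctx = 1024, requested = 2048;  // file default vs. caller
        const int desiredMaxCtx = std::max(file_ctx, requested);
        printf("KV cache at file n_ctx:    %zu MiB\n",
               kv_cache_bytes(n_layer, file_ctx, n_embd) >> 20);
        printf("KV cache at desiredMaxCtx: %zu MiB\n",
               kv_cache_bytes(n_layer, desiredMaxCtx, n_embd) >> 20);
        return 0;
    }

Doubling n_ctx doubles the cache; sizing it from the file's n_ctx alone would leave no room for contexts longer than what the model was converted with.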


@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         }
     }
+    auto memory_type = GGML_TYPE_F16;
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
@@ -243,8 +245,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
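
memory_type switches the newer GPT-2 KV cache from GGML_TYPE_F32 to GGML_TYPE_F16, halving its footprint for the same context length (the exact byte count still comes from ggml_nbytes, as above). A back-of-the-envelope comparison with illustrative GPT-2-XL-like dimensions:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // n_embd * n_layer * n_ctx values for K, and the same again for V.
        const size_t n_values = 2ull * 1600 * 48 * 2048;
        printf("KV cache as F32: %zu MiB\n", n_values * 4 >> 20);  // 4 bytes per value
        printf("KV cache as F16: %zu MiB\n", n_values * 2 >> 20);  // 2 bytes per value
        return 0;
    }
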
@@ -370,7 +372,8 @@ bool gpt2_eval(
     const int n_head = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {


@@ -378,7 +378,8 @@ bool gptj_eval(
     const int d_key = n_embd/n_head;
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
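
The last two hunks are the "increase the buffer size" part of the commit message: the static scratch buffer used by gpt2_eval and gptj_eval grows from 256 MiB to 1 GiB, because something about larger contexts still overruns it even though the mem_per_token check at the end of each hunk is meant to grow it on demand. A sketch of how that grow-on-demand pattern generally looks; ensure_scratch is a hypothetical helper, and only the if-condition itself comes from the code above:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // If the per-token estimate times the number of tokens in this evaluation
    // exceeds the current capacity, grow the buffer (with a little headroom)
    // before building the compute graph.
    static void * ensure_scratch(void * buf, size_t & buf_size,
                                 size_t mem_per_token, size_t n_tokens)
    {
        if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size) {
            const size_t needed = (size_t)(1.1 * mem_per_token * n_tokens); // ~10% headroom
            void * grown = realloc(buf, needed);
            if (grown == nullptr) {
                fprintf(stderr, "failed to grow scratch buffer to %zu bytes\n", needed);
                exit(1);
            }
            buf = grown;
            buf_size = needed;
        }
        return buf;
    }

    int main()
    {
        size_t buf_size = 256u * 1024 * 1024;  // the old default
        void * buf = malloc(buf_size);
        // mem_per_token is measured on the first evaluation; a large context can
        // push the estimate well past the initial allocation on later calls.
        buf = ensure_scratch(buf, buf_size, /*mem_per_token=*/600 * 1024, /*n_tokens=*/2048);
        printf("scratch buffer is now %zu MiB\n", buf_size >> 20);
        free(buf);
        return 0;
    }

Bumping the static default to 1 GiB sidesteps whatever case that check misses, at the cost of a larger idle allocation.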