diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 81d14067c..e04f04c40 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_threads = params.n_threads = inputs.threads;
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
+    params.memory_f16 = inputs.f16_kv;
+    params.n_ctx = inputs.max_context_length;
+    model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
 
     if (file_format == FileFormat::GPT2_1)
     {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.temp = inputs.temperature;
     params.repeat_last_n = inputs.rep_pen_range;
     params.repeat_penalty = inputs.rep_pen;
+    params.n_ctx = inputs.max_context_length;
     params.n_batch = n_batch;
     params.n_threads = n_threads;
 
@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
 
     //truncate to front of the prompt if its too long
-    int32_t nctx = 512;
-    if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-    {
-        nctx = model_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPTJ_3)
-    {
-        nctx = model_v2.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_1)
-    {
-        nctx = model_gpt2_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_2)
-    {
-        nctx = model_gpt2_v2.hparams.n_ctx;
-    }
+    int32_t nctx = params.n_ctx;
 
     if (embd_inp.size() + params.n_predict > nctx)
     {
diff --git a/koboldcpp.py b/koboldcpp.py
index d2364373d..ab791d4d7 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -349,7 +349,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
     print("Load Model OK: " + str(loadok))
 
     if not loadok:
@@ -378,7 +378,7 @@ def main(args):
     RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 
 if __name__ == '__main__':
-    print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+    print("Welcome to KoboldCpp - Version 1.4") # just update version manually
     parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
     parser.add_argument("model_file", help="Model file to load", nargs="?")
     portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-    parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
     args = parser.parse_args()
     main(args)
diff --git a/otherarch/gpt2_v1.cpp b/otherarch/gpt2_v1.cpp
index 6a784a85d..1bea45b8c 100644
--- a/otherarch/gpt2_v1.cpp
+++ b/otherarch/gpt2_v1.cpp
@@ -36,10 +36,13 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         }
     }
 
+    auto desiredMaxCtx = model.hparams.n_ctx;
+
     // load hparams
     {
         auto & hparams = model.hparams;
+
 
         fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
         fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
@@ -47,6 +50,9 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
 
+        //used to expand KV size if needed
+        desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
+        const int n_ctx   = desiredMaxCtx;
         const int n_vocab = hparams.n_vocab;
 
         ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
+        const int n_ctx   = desiredMaxCtx;
 
         const int n_mem      = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 3f90b7f5b..b24dfd36d 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         }
     }
 
+    auto memory_type = GGML_TYPE_F16;
+
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
@@ -242,9 +244,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
         const int n_mem      = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
@@ -370,7 +372,8 @@ bool gpt2_eval(
     const int n_head  = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
 
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 7ed33b816..484ebf5ab 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -378,7 +378,8 @@ bool gptj_eval(
 
     const int d_key = n_embd/n_head;
 
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
 
    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
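
Note (not part of the patch): the KV cache sized in the loaders above grows as n_mem = n_layer*n_ctx and n_elements = n_embd*n_mem, so raising max_context_length scales it linearly, while allocating memory_k/memory_v as GGML_TYPE_F16 (2 bytes per element) instead of GGML_TYPE_F32 (4 bytes) halves it. Below is a minimal standalone sketch of that arithmetic only; the hyperparameters are illustrative GPT-2-medium-like values, not numbers read from a model file.

// Standalone sketch: estimate KV-cache bytes the same way the loaders size
// model.memory_k / model.memory_v (n_mem = n_layer*n_ctx, n_elements = n_embd*n_mem).
// elem_size is the per-element size: 4 bytes for F32, 2 bytes for F16.
#include <cstdio>
#include <cstddef>
#include <initializer_list>

static size_t kv_cache_bytes(int n_layer, int n_ctx, int n_embd, size_t elem_size)
{
    const size_t n_mem      = (size_t)n_layer * n_ctx;
    const size_t n_elements = (size_t)n_embd  * n_mem;
    return 2 * n_elements * elem_size; // memory_k + memory_v
}

int main()
{
    // Illustrative hyperparameters (assumed, roughly GPT-2-medium sized).
    const int n_layer = 24, n_embd = 1024;
    for (int n_ctx : {1024, 2048})
    {
        printf("n_ctx=%d  F32: %zu MiB  F16: %zu MiB\n", n_ctx,
               kv_cache_bytes(n_layer, n_ctx, n_embd, 4) >> 20,
               kv_cache_bytes(n_layer, n_ctx, n_embd, 2) >> 20);
    }
    return 0;
}

For these assumed values, doubling n_ctx from 1024 to 2048 doubles the cache (192 MiB to 384 MiB in F32), and switching to F16 brings the 2048-context cache back down to 192 MiB, which is the trade-off the f16_kv / memory_f16 plumbing in this patch exposes.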