diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 40a799f66..81d14067c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -35,6 +35,12 @@
 static std::vector current_context_tokens;
 static size_t mem_per_token = 0;
 static std::vector logits;
+inline bool IsNanCheck(float f)
+{
+    const unsigned int u = *(unsigned int*)&f;
+    return (u&0x7F800000) == 0x7F800000 && (u&0x7FFFFF); // Both NaN and qNan.
+}
+
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format)
 {
     ggml_time_init();
@@ -93,6 +99,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         // determine the required inference memory per token:
         legacy_gptj_eval(model_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+
+        //if the logits are NAN, it means the model is incompatible
+        if(logits.size()>0 && IsNanCheck(logits[0]))
+        {
+            printf("\nBad Logits detected! Retrying GPT-J model loading...");
+            ggml_v1_free(model_v1.ctx);
+            return ModelLoadResult::RETRY_LOAD;
+        }
+
         return ModelLoadResult::SUCCESS;
     }
     else
@@ -110,7 +125,17 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         // determine the required inference memory per token:
-        gptj_eval(model_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+        gptj_eval(model_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+
+        //if the logits are NAN, it means the model is incompatible
+        if(logits.size()>0 && IsNanCheck(logits[0]))
+        {
+            printf("\nBad Logits detected! Retrying GPT-J model loading...");
+            ggml_free(model_v2.ctx);
+            return ModelLoadResult::RETRY_LOAD;
+        }
+
         return ModelLoadResult::SUCCESS;
     }
 
 
@@ -204,7 +229,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    bool blasmode = false; //(embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    // bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
+    // bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    bool blasmode = false;
     int original_batch = params.n_batch;
     int original_threads = params.n_threads;
     if (blasmode)
@@ -355,7 +382,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         }
     }
     time2 = timer_check();
-    printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs", time1, time2, (time1 + time2));
+    float pt1 = (time1*1000.0/(embd_inp_size==0?1:embd_inp_size));
+    float pt2 = (time2*1000.0/(params.n_predict==0?1:params.n_predict));
+    printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
     fflush(stdout);
     output.status = 1;
     snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
diff --git a/koboldcpp.py b/koboldcpp.py
index 8879f09c6..7797bb09a 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -333,7 +333,7 @@ def main(args):
             print("\nNo ggml model file was selected. Exiting.")
             time.sleep(1)
             sys.exit(2)
-        except:
+        except Exception as ex:
             print("File selection GUI unsupported. Please check command line: script.py --help")
             time.sleep(1)
             sys.exit(2)
@@ -376,6 +376,7 @@ def main(args):
     RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 
 if __name__ == '__main__':
+    print("Welcome to KoboldCpp - Version 1.3") # just update version manually
     parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
     parser.add_argument("model_file", help="Model file to load", nargs="?")
     portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 159d40e5c..0a7c1fd20 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -86,10 +86,32 @@ void print_tok_vec(std::vector &embd)
         if(vocabsiz==50400) //know GPT-J vocab size
         {
             fileformat = FileFormat::GPTJ_1;
+            uint32_t temp;
+            fin.read((char *)&temp, sizeof(temp)); //ctx
+            fin.read((char *)&temp, sizeof(temp)); //n_embd
+            fin.read((char *)&temp, sizeof(temp)); //n_head
+            fin.read((char *)&temp, sizeof(temp)); //n_layer
+            fin.read((char *)&temp, sizeof(temp)); //n_rot
+            fin.read((char *)&temp, sizeof(temp)); //f16
+            if(temp!=0 && temp!=1)
+            {
+                fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
+            }
         }
 
         if(vocabsiz==50257)
         {
             fileformat = FileFormat::GPT2_1;
+            uint32_t temp;
+            fin.read((char *)&temp, sizeof(temp)); //ctx
+            fin.read((char *)&temp, sizeof(temp)); //n_embd
+            fin.read((char *)&temp, sizeof(temp)); //n_head
+            fin.read((char *)&temp, sizeof(temp)); //n_layer
+            fin.read((char *)&temp, sizeof(temp)); //f16
+            if(temp!=0 && temp!=1)
+            {
+                fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+            }
+        }
         }
     else if(magic == 0x67676d66) //v2 format ggmf
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index f54aac56f..3f90b7f5b 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -141,6 +141,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
         ctx_size += (6 + 12*n_layer)*256; // object overhead
 
+        // if(wtype==GGML_TYPE_Q4_0 || wtype==GGML_TYPE_Q4_1)
+        // {
+        //     //quantized needs more context
+        //     ctx_size = (ctx_size*4);
+        // }
+
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
diff --git a/otherarch/gptj_quantize.cpp b/otherarch/gptj_quantize.cpp
index 7782f2846..8d29f7748 100644
--- a/otherarch/gptj_quantize.cpp
+++ b/otherarch/gptj_quantize.cpp
@@ -291,6 +291,13 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
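
Note (not part of the patch): IsNanCheck in gpttype_adapter.cpp relies on the IEEE-754 layout of a float: all exponent bits set (0x7F800000) combined with a non-zero mantissa means NaN, while a zero mantissa would be infinity. Below is a minimal standalone sketch of the same bit test; the helper name is hypothetical, and it uses memcpy instead of the patch's pointer cast to sidestep strict-aliasing concerns.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

// Same bit test as the patch's IsNanCheck: exponent bits all set plus a
// non-zero mantissa identify NaN; a zero mantissa would be +/- infinity.
static bool is_nan_bits(float f)
{
    uint32_t u = 0;
    std::memcpy(&u, &f, sizeof(u)); // avoids the strict-aliasing pointer cast
    return (u & 0x7F800000u) == 0x7F800000u && (u & 0x007FFFFFu) != 0;
}

int main()
{
    printf("NaN:%d Inf:%d 1.0:%d\n",
           is_nan_bits(std::numeric_limits<float>::quiet_NaN()),
           is_nan_bits(std::numeric_limits<float>::infinity()),
           is_nan_bits(1.0f));
    // Prints "NaN:1 Inf:0 1.0:0", which is the behaviour the bad-logits
    // retry in gpttype_load_model depends on.
    return 0;
}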
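
Note (not part of the patch): the model_adapter.cpp change sniffs quantized legacy files by reading the uint32 hyperparameters that follow the magic and vocab size and inspecting the trailing f16/ftype field: legacy unquantized models store 0 (f32) or 1 (f16), so any other value is treated as a newer quantized format (GPTJ_3 / GPT2_2). A rough sketch of that idea under those assumptions; the function and its parameters are hypothetical.

#include <cstdint>
#include <fstream>

// Hypothetical helper: after the magic number and vocab size have been read,
// skip the remaining uint32 hyperparameters and check the trailing f16/ftype
// flag. 0 or 1 means a legacy f32/f16 file; anything else is assumed to be a
// quantized, non-legacy format.
static bool looks_quantized(std::ifstream &fin, int fields_before_ftype)
{
    uint32_t temp = 0;
    for (int i = 0; i < fields_before_ftype; ++i)
    {
        fin.read(reinterpret_cast<char *>(&temp), sizeof(temp)); // ctx, n_embd, n_head, ...
    }
    fin.read(reinterpret_cast<char *>(&temp), sizeof(temp)); // f16 / ftype flag
    return temp != 0 && temp != 1;
}

In the patch, the GPT-J branch skips five fields (ctx, n_embd, n_head, n_layer, n_rot) before reading the flag, while the GPT-2 branch skips four.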