added version label, improved file type checks
This commit is contained in:
parent
1543c700d8
commit
18a154715e
5 changed files with 69 additions and 4 deletions
|
@ -35,6 +35,12 @@ static std::vector<gpt_vocab::id> current_context_tokens;
|
|||
static size_t mem_per_token = 0;
|
||||
static std::vector<float> logits;
|
||||
|
||||
// Returns true when `f` holds an IEEE-754 single-precision NaN (quiet or
// signaling), i.e. all exponent bits are set and the mantissa is non-zero.
// Infinities (exponent all ones, mantissa zero) return false.
//
// The test is done on the raw bit pattern rather than with `f != f` so that
// it does not rely on floating-point comparison semantics.
inline bool IsNanCheck(float f)
{
    static_assert(sizeof(unsigned int) == sizeof(float),
                  "IsNanCheck expects a 32-bit float and a 32-bit unsigned int");

    // Copy the object representation byte by byte. Accessing any object
    // through `unsigned char` is permitted by the aliasing rules, whereas
    // dereferencing a `float*` cast to `unsigned int*` is undefined behaviour.
    unsigned int bits = 0;
    const unsigned char * src = reinterpret_cast<const unsigned char *>(&f);
    unsigned char * dst = reinterpret_cast<unsigned char *>(&bits);
    for (unsigned i = 0; i < sizeof(bits); ++i)
    {
        dst[i] = src[i];
    }

    const unsigned int exponent_mask = 0x7F800000u; // 8 exponent bits
    const unsigned int mantissa_mask = 0x007FFFFFu; // 23 mantissa bits
    return (bits & exponent_mask) == exponent_mask && (bits & mantissa_mask) != 0;
}
|
||||
|
||||
ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format)
|
||||
{
|
||||
ggml_time_init();
|
||||
|
@ -93,6 +99,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
}
|
||||
// determine the required inference memory per token:
|
||||
legacy_gptj_eval(model_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
|
||||
//if the logits are NAN, it means the model is incompatible
|
||||
if(logits.size()>0 && IsNanCheck(logits[0]))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_v1_free(model_v1.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
|
@ -111,6 +126,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
// determine the required inference memory per token:
|
||||
gptj_eval(model_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
|
||||
//if the logits are NAN, it means the model is incompatible
|
||||
if(logits.size()>0 && IsNanCheck(logits[0]))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_free(model_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -204,7 +229,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
|
||||
|
||||
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
||||
bool blasmode = false; //(embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
||||
// bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
|
||||
// bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
||||
bool blasmode = false;
|
||||
int original_batch = params.n_batch;
|
||||
int original_threads = params.n_threads;
|
||||
if (blasmode)
|
||||
|
@ -355,7 +382,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
}
|
||||
}
|
||||
time2 = timer_check();
|
||||
printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs", time1, time2, (time1 + time2));
|
||||
float pt1 = (time1*1000.0/(embd_inp_size==0?1:embd_inp_size));
|
||||
float pt2 = (time2*1000.0/(params.n_predict==0?1:params.n_predict));
|
||||
printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
|
||||
fflush(stdout);
|
||||
output.status = 1;
|
||||
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
|
||||
|
|
|
@ -333,7 +333,7 @@ def main(args):
|
|||
print("\nNo ggml model file was selected. Exiting.")
|
||||
time.sleep(1)
|
||||
sys.exit(2)
|
||||
except:
|
||||
except Exception as ex:
|
||||
print("File selection GUI unsupported. Please check command line: script.py --help")
|
||||
time.sleep(1)
|
||||
sys.exit(2)
|
||||
|
@ -376,6 +376,7 @@ def main(args):
|
|||
RunServerMultiThreaded(args.host, args.port, embedded_kailite)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("Welcome to KoboldCpp - Version 1.3") # just update version manually
|
||||
parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
|
||||
parser.add_argument("model_file", help="Model file to load", nargs="?")
|
||||
portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
|
||||
|
|
|
@ -86,10 +86,32 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
if(vocabsiz==50400) //know GPT-J vocab size
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_1;
|
||||
uint32_t temp;
|
||||
fin.read((char *)&temp, sizeof(temp)); //ctx
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_head
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_layer
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_rot
|
||||
fin.read((char *)&temp, sizeof(temp)); //f16
|
||||
if(temp!=0 && temp!=1)
|
||||
{
|
||||
fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
|
||||
}
|
||||
}
|
||||
if(vocabsiz==50257)
|
||||
{
|
||||
fileformat = FileFormat::GPT2_1;
|
||||
uint32_t temp;
|
||||
fin.read((char *)&temp, sizeof(temp)); //ctx
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_embd
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_head
|
||||
fin.read((char *)&temp, sizeof(temp)); //n_layer
|
||||
fin.read((char *)&temp, sizeof(temp)); //f16
|
||||
if(temp!=0 && temp!=1)
|
||||
{
|
||||
fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else if(magic == 0x67676d66) //v2 format ggmf
|
||||
|
|
|
@ -141,6 +141,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|||
|
||||
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
||||
|
||||
// if(wtype==GGML_TYPE_Q4_0 || wtype==GGML_TYPE_Q4_1)
|
||||
// {
|
||||
// //quantized needs more context
|
||||
// ctx_size = (ctx_size*4);
|
||||
// }
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
|
|
|
@ -291,6 +291,13 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue