diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index d08b39a2c..fc7bf520d 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -14,7 +14,7 @@
 
 #include "ggml.h"
 
-#define CL_DMMV_BLOCK_SIZE 32;
+#define CL_DMMV_BLOCK_SIZE 64;
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 static std::string program_source = MULTILINE_QUOTE(
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 3449be635..55a4668e4 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -857,18 +857,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         gpt_vocab::id id = 0;
         // predict
         unsigned int embdsize = embd.size();
+        //print progress
+        if (!startedsampling)
+        {
+            printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
+        }
+        fflush(stdout);
+
         if (embdsize > 0)
         {
-            //print progress
-            if (!startedsampling)
-            {
-                printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
-            }
-            else
-            {
-                printf("\rGenerating (%d / %d tokens)", (1 + params.n_predict - remaining_tokens), params.n_predict);
-            }
-            fflush(stdout);
 
             bool evalres = false;
 
@@ -954,40 +951,35 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
                 printf("\n");
             }
 
+            unsigned int eosID = 0;
+            float * logitsPtr;
             if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
             {
-                float * logits;
                 if(file_format == FileFormat::GGJT_3)
                 {
-                    logits = llama_get_logits(llama_ctx_v3);
+                    logitsPtr = llama_get_logits(llama_ctx_v3);
                 }
                 else
                 {
-                    logits = llama_v2_get_logits(llama_ctx_v2);
+                    logitsPtr = llama_v2_get_logits(llama_ctx_v2);
                 }
+                eosID = llama_token_eos();
+
                 if (!unbanTokens)
                 {
                     // set the logit of the eos token (2) to zero to avoid sampling it
-                    logits[llama_token_eos()] = 0;
-                    //set logits of opening square bracket to zero. (disabled as obsolete)
-                    // logits[518] = 0;
-                    // logits[29961] = 0;
+                    logitsPtr[eosID] = 0;
                 }
-
-
-                id = SampleLogits(logits, nctx, n_vocab, last_n_size, repeat_penalty,
-                top_k, top_p, typical_p, tfs_z, temp, rng,
-                params.mirostat,params.mirostat_tau,params.mirostat_eta);
-
             }
             else
             {
+                logitsPtr = logits.data();
                 if (!unbanTokens)
                 {
                     //gpt2 uses negative logits, so we cant zero it
                     // set the logit of the eos token to minimum to avoid sampling it
-                    if ((file_format == FileFormat::GPT2_1 ||
+                    if (file_format == FileFormat::GPT2_1 ||
                         file_format == FileFormat::GPT2_2 ||
                         file_format == FileFormat::GPT2_3 ||
                         file_format == FileFormat::GPT2_4 ||
@@ -995,11 +987,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                         file_format == FileFormat::GPTJ_2 ||
                         file_format == FileFormat::GPTJ_3 ||
                         file_format == FileFormat::GPTJ_4 ||
-                        file_format == FileFormat::GPTJ_5) &&
-                        logits.size() > 50256)
-                    {
-                        int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-                        logits[50256] = (logits[topid] < 0 ? logits[topid] : 0);
+                        file_format == FileFormat::GPTJ_5)
+                    {
+                        eosID = 50256;
+                        if(logits.size() > eosID)
+                        {
+                            int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
+                            logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+                        }
                     }
 
                     // set the logit of the eos token (0) to minimum to avoid sampling it
@@ -1011,16 +1006,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
                         file_format == FileFormat::NEOX_6 ||
                         file_format == FileFormat::NEOX_7)
                     {
+                        eosID = 0;
                         int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-                        logits[0] = (logits[topid] < 0 ? logits[topid] : 0);
+                        logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
                     }
-                }
-
-                id = SampleLogits(logits.data(), nctx, n_vocab, last_n_size, repeat_penalty,
-                top_k, top_p, typical_p, tfs_z, temp, rng,
-                params.mirostat,params.mirostat_tau,params.mirostat_eta);
+                }
+            }
-
+
+            id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty,
+            top_k, top_p, typical_p, tfs_z, temp, rng,
+            params.mirostat,params.mirostat_tau,params.mirostat_eta);
+
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(id);
             current_context_tokens.push_back(id);
@@ -1031,31 +1028,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
             // decrement remaining sampling budget
             --remaining_tokens;
-            if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2|| file_format == FileFormat::GGJT_3)
+            for (auto id : embd)
             {
-                if(file_format == FileFormat::GGJT_3)
-                {
-                    concat_output += llama_token_to_str(llama_ctx_v3, id);
-                }
-                else
-                {
-                    concat_output += llama_v2_token_to_str(llama_ctx_v2, id);
-                }
-
-                if(unbanTokens && id==llama_token_eos())
-                {
-                    printf("\n(EOS token triggered!)");
-                    remaining_tokens = 0;
-                }
+                concat_output += FileFormatTokenizeID(id,file_format);
             }
-            else
-            {
-                for (auto id : embd)
-                {
-                    concat_output += vocab.id_to_token[id].c_str();
-                }
-            }
-
+
+            if (startedsampling)
+            {
+                printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
+            }
             if(debugmode && top_picks.size()>0)
             {
                 printf(" [");
@@ -1074,6 +1055,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                 printf("]\n");
             }
 
+            if(unbanTokens && id==eosID)
+            {
+                printf("\n(EOS token triggered!)");
+                remaining_tokens = 0;
+            }
+
             for (const auto &matched : stop_sequence)
             {
                 if (concat_output.find(matched) != std::string::npos)
@@ -1084,6 +1071,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                     break;
                 }
             }
+            fflush(stdout);
         }
         else
         {
diff --git a/koboldcpp.py b/koboldcpp.py
index 3891ca31a..3fca437fd 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -208,7 +208,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.25.1"
+KcppVersion = "1.26"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/otherarch/ggml_v2-opencl.cpp b/otherarch/ggml_v2-opencl.cpp
index 93f038ef5..6193662ba 100644
--- a/otherarch/ggml_v2-opencl.cpp
+++ b/otherarch/ggml_v2-opencl.cpp
@@ -14,7 +14,7 @@
 
 #include "ggml_v2.h"
 
-#define CL_DMMV_BLOCK_SIZE 32;
+#define CL_DMMV_BLOCK_SIZE 64;
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 static std::string program_source = MULTILINE_QUOTE(