diff --git a/class.py b/class.py
index 60fe481c8..76ad123d7 100644
--- a/class.py
+++ b/class.py
@@ -268,9 +268,8 @@ class model_backend(InferenceModel):
         if not kcpp_backend_loaded:
             kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename, port=5001, port_param=5001,
             host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
-            psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
-            blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
-            unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
+            highpriority=False, contextsize=self.kcpp_ctxsize, blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase],
+            smartcontext=self.kcpp_smartcontext, bantokens=None, forceversion=0, nommap=self.kcpp_nommap,
             usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
             useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
             onready='', multiuser=False, foreground=False)
diff --git a/colab.ipynb b/colab.ipynb
index da9bef59a..b3742ac04 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -1,29 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "private_outputs": true,
-      "provenance": [],
-      "gpuType": "T4",
-      "authorship_tag": "",
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "view-in-github"
       },
       "source": []
     },
@@ -36,24 +17,43 @@
       },
       "outputs": [],
       "source": [
-        "#@title v-- Enter your model below and then click this to start Koboldcpp\n",
-        "\n",
-        "Model = \"https://huggingface.co/TheBloke/Airoboros-L2-13B-2.2-GGUF/resolve/main/airoboros-l2-13b-2.2.Q4_K_M.gguf\" #@param [\"\"]{allow-input: true}\n",
-        "Layers = 43 #@param [43]{allow-input: true}\n",
-        "\n",
-        "%cd /content\n",
-        "!git clone https://github.com/LostRuins/koboldcpp\n",
-        "%cd /content/koboldcpp\n",
-        "!make LLAMA_CUBLAS=1\n",
-        "\n",
-        "!wget $Model -O model.ggml\n",
-        "!wget -c https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64\n",
-        "!chmod +x cloudflared-linux-amd64\n",
-        "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n",
-        "!sleep 10\n",
-        "!cat nohup.out\n",
-        "!python koboldcpp.py model.ggml --stream --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\n"
+        "#@title v-- Enter your model below and then click this to start Koboldcpp\r\n",
+        "\r\n",
+        "Model = \"https://huggingface.co/TheBloke/Airoboros-L2-13B-2.2-GGUF/resolve/main/airoboros-l2-13b-2.2.Q4_K_M.gguf\" #@param [\"\"]{allow-input: true}\r\n",
+        "Layers = 43 #@param [43]{allow-input: true}\r\n",
+        "\r\n",
+        "%cd /content\r\n",
+        "!git clone https://github.com/LostRuins/koboldcpp\r\n",
+        "%cd /content/koboldcpp\r\n",
+        "!make LLAMA_CUBLAS=1\r\n",
+        "\r\n",
+        "!wget $Model -O model.ggml\r\n",
+        "!wget -c https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64\r\n",
+        "!chmod +x cloudflared-linux-amd64\r\n",
+        "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
+        "!sleep 10\r\n",
+        "!cat nohup.out\r\n",
+        "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\r\n"
       ]
     }
-  ]
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "authorship_tag": "",
+      "gpuType": "T4",
+      "include_colab_link": true,
+      "private_outputs": true,
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
\ No newline at end of file
diff --git a/expose.h b/expose.h
index 8791a1915..c2fbb2267 100644
--- a/expose.h
+++ b/expose.h
@@ -38,7 +38,6 @@ struct load_model_inputs
     const bool use_mmap;
     const bool use_mlock;
     const bool use_smartcontext;
-    const bool unban_tokens;
     const int clblast_info = 0;
     const int cublas_info = 0;
     const int blasbatchsize = 512;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 68db83448..260f11021 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -78,7 +78,6 @@ static int n_threads = 4;
 static int n_blasthreads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
-static bool unbanTokens = false;
 static int blasbatchsize = 512;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
 static std::string modelname;
@@ -556,7 +555,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     modelname = params.model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
     debugmode = inputs.debugmode;
-    unbanTokens = inputs.unban_tokens;
     blasbatchsize = inputs.blasbatchsize;
     if(blasbatchsize<=0)
     {
@@ -1656,7 +1654,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             lowestLogit = LowestLogit(logits);
         }

-        if (!unbanTokens && !inputs.unban_tokens_rt)
+        if (!inputs.unban_tokens_rt)
         {
             // set the logit of the eos token to very low to avoid sampling it
             logitsPtr[eosID] = lowestLogit;
@@ -1721,7 +1719,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                 printf("]\n");
             }

-            if((unbanTokens||inputs.unban_tokens_rt) && id==eosID)
+            if(inputs.unban_tokens_rt && id==eosID)
             {
                 stopper_unused_tokens = remaining_tokens;
                 printf("\n(EOS token triggered!)");
diff --git a/koboldcpp.py b/koboldcpp.py
index d954ac605..373264399 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -34,7 +34,6 @@ class load_model_inputs(ctypes.Structure):
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
                 ("use_smartcontext", ctypes.c_bool),
-                ("unban_tokens", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
                 ("cublas_info", ctypes.c_int),
                 ("blasbatchsize", ctypes.c_int),
@@ -224,7 +223,6 @@ def load_model(model_filename):
         if len(args.lora) > 1:
             inputs.lora_base = args.lora[1].encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
-    inputs.unban_tokens = args.unbantokens
     inputs.blasbatchsize = args.blasbatchsize
     inputs.forceversion = args.forceversion
     inputs.gpulayers = args.gpulayers
@@ -307,11 +305,7 @@ def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_
     inputs.grammar = grammar.encode("UTF-8")
     inputs.grammar_retain_state = grammar_retain_state
     inputs.unban_tokens_rt = not use_default_badwordsids
-    if args.usemirostat and args.usemirostat[0]>0:
-        inputs.mirostat = int(args.usemirostat[0])
-        inputs.mirostat_tau = float(args.usemirostat[1])
-        inputs.mirostat_eta = float(args.usemirostat[2])
-    elif mirostat in (1, 2):
+    if mirostat in (1, 2):
         inputs.mirostat = mirostat
         inputs.mirostat_tau = mirostat_tau
         inputs.mirostat_eta = mirostat_eta
@@ -367,7 +361,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.45.2"
+KcppVersion = "1.46"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
@@ -529,17 +523,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         force_json = False

         if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
-            if args.stream and not "streaming=1" in self.path:
-                self.path = self.path.replace("streaming=0","")
-                if self.path.startswith(('/?','?')):
-                    self.path += "&streaming=1"
-                else:
-                    self.path = self.path + "?streaming=1"
-                self.send_response(302)
-                self.send_header("Location", self.path)
-                self.end_headers()
-                print("Force redirect to streaming mode, as --stream is set.")
-                return None

             if self.embedded_kailite is None:
                 response_body = (f"Embedded Kobold Lite is not found. You will have to connect via the main KoboldAI client, or use this URL to connect.").encode()
@@ -954,28 +937,18 @@ def show_new_gui():
     launchbrowser = ctk.IntVar(value=1)
     highpriority = ctk.IntVar()
     disablemmap = ctk.IntVar()
-    psutil = ctk.IntVar()
     usemlock = ctk.IntVar()
     debugmode = ctk.IntVar()
     keepforeground = ctk.IntVar()

     lowvram_var = ctk.IntVar()
     mmq_var = ctk.IntVar(value=1)
-    blas_threads_var = ctk.StringVar()
     blas_size_var = ctk.IntVar()
     version_var =ctk.StringVar(value="0")

-    stream = ctk.IntVar()
     smartcontext = ctk.IntVar()
-    unbantokens = ctk.IntVar()
-    usemirostat = ctk.IntVar()
-    mirostat_var = ctk.StringVar(value="2")
-    mirostat_tau = ctk.StringVar(value="5.0")
-    mirostat_eta = ctk.StringVar(value="0.1")
-
     context_var = ctk.IntVar()
-
     customrope_var = ctk.IntVar()
     customrope_scale = ctk.StringVar(value="1.0")
     customrope_base = ctk.StringVar(value="10000")

@@ -1066,7 +1039,7 @@ def show_new_gui():
     makeslider(quick_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 7, 12, set=5)

     # quick boxes
-    quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens, "Disable MMAP":disablemmap,}
+    quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Use SmartContext":smartcontext, "Disable MMAP":disablemmap,}
     for idx, name, in enumerate(quick_boxes):
         makecheckbox(quick_tab, name, quick_boxes[name], int(idx/2) +20, idx%2)
     # context size
@@ -1099,7 +1072,7 @@ def show_new_gui():
     makelabelentry(hardware_tab, "Threads:" , threads_var, 8, 50)

     # hardware checkboxes
-    hardware_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Disable MMAP":disablemmap, "Use mlock":usemlock, "PSUtil Set Threads":psutil, "Debug Mode":debugmode, "Keep Foreground":keepforeground}
+    hardware_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Disable MMAP":disablemmap, "Use mlock":usemlock, "Debug Mode":debugmode, "Keep Foreground":keepforeground}
     for idx, name, in enumerate(hardware_boxes):
         makecheckbox(hardware_tab, name, hardware_boxes[name], int(idx/2) +30, idx%2)

@@ -1117,24 +1090,10 @@ def show_new_gui():
     # Tokens Tab
     tokens_tab = tabcontent["Tokens"]
     # tokens checkboxes
-    token_boxes = {"Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens}
+    token_boxes = {"Use SmartContext":smartcontext}
     for idx, name, in enumerate(token_boxes):
         makecheckbox(tokens_tab, name, token_boxes[name], idx + 1)

-    mirostat_entry, mirostate_label = makelabelentry(tokens_tab, "Mirostat:", mirostat_var)
-    mirostat_tau_entry, mirostat_tau_label = makelabelentry(tokens_tab, "Mirostat Tau:", mirostat_tau)
-    mirostat_eta_entry, mirostat_eta_label = makelabelentry(tokens_tab, "Mirostat Eta:", mirostat_eta)
-    def togglemiro(a,b,c):
-        items = [mirostate_label, mirostat_entry, mirostat_tau_label, mirostat_tau_entry, mirostat_eta_label, mirostat_eta_entry]
-        for idx, item in enumerate(items):
-            if usemirostat.get() == 1:
-                item.grid(row=11 + int(idx/2), column=idx%2, padx=8, stick="nw")
-            else:
-                item.grid_forget()
-
-
-    makecheckbox(tokens_tab, "Use Mirostat", row=10, variable=usemirostat, command=togglemiro)
-    togglemiro(1,1,1)

     # context size
     makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 20, set=2)
@@ -1217,10 +1176,7 @@ def show_new_gui():
         args.launch = launchbrowser.get()==1
         args.highpriority = highpriority.get()==1
         args.nommap = disablemmap.get()==1
-        args.psutil_set_threads = psutil.get()==1
-        args.stream = stream.get()==1
         args.smartcontext = smartcontext.get()==1
-        args.unbantokens = unbantokens.get()==1
         args.foreground = keepforeground.get()==1

         gpuchoiceidx = 0
@@ -1251,7 +1207,6 @@ def show_new_gui():
         args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
         args.forceversion = 0 if version_var.get()=="" else int(version_var.get())

-        args.usemirostat = [int(mirostat_var.get()), float(mirostat_tau.get()), float(mirostat_eta.get())] if usemirostat.get()==1 else None
         args.contextsize = int(contextsize_text[context_var.get()])

         if customrope_var.get()==1:
@@ -1277,10 +1232,7 @@ def show_new_gui():
         launchbrowser.set(1 if "launch" in dict and dict["launch"] else 0)
         highpriority.set(1 if "highpriority" in dict and dict["highpriority"] else 0)
         disablemmap.set(1 if "nommap" in dict and dict["nommap"] else 0)
-        psutil.set(1 if "psutil_set_threads" in dict and dict["psutil_set_threads"] else 0)
-        stream.set(1 if "stream" in dict and dict["stream"] else 0)
         smartcontext.set(1 if "smartcontext" in dict and dict["smartcontext"] else 0)
-        unbantokens.set(1 if "unbantokens" in dict and dict["unbantokens"] else 0)
         keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
         if "useclblast" in dict and dict["useclblast"]:
             if clblast_option is not None:
@@ -1331,12 +1283,6 @@ def show_new_gui():
         if "forceversion" in dict and dict["forceversion"]:
             version_var.set(str(dict["forceversion"]))

-        if "usemirostat" in dict and dict["usemirostat"] and len(dict["usemirostat"])>1:
-            usemirostat.set(0 if str(dict["usemirostat"][0])=="0" else 1)
-            mirostat_var.set(str(dict["usemirostat"][0]))
-            mirostat_tau.set(str(dict["usemirostat"][1]))
-            mirostat_eta.set(str(dict["usemirostat"][2]))
-
         if "model_param" in dict and dict["model_param"]:
             model_var.set(dict["model_param"])

@@ -1496,18 +1442,14 @@ def show_old_gui():
         frameC.grid(row=4,column=0,pady=4)
         onDropdownChange(None)

-        stream = tk.IntVar()
         smartcontext = tk.IntVar()
         launchbrowser = tk.IntVar(value=1)
-        unbantokens = tk.IntVar()
         highpriority = tk.IntVar()
         disablemmap = tk.IntVar()

         frameD = tk.Frame(root)
-        tk.Checkbutton(frameD, text='Streaming Mode',variable=stream, onvalue=1, offvalue=0).grid(row=0,column=0)
         tk.Checkbutton(frameD, text='Use SmartContext',variable=smartcontext, onvalue=1, offvalue=0).grid(row=0,column=1)
         tk.Checkbutton(frameD, text='High Priority',variable=highpriority, onvalue=1, offvalue=0).grid(row=1,column=0)
         tk.Checkbutton(frameD, text='Disable MMAP',variable=disablemmap, onvalue=1, offvalue=0).grid(row=1,column=1)
-        tk.Checkbutton(frameD, text='Unban Tokens',variable=unbantokens, onvalue=1, offvalue=0).grid(row=2,column=0)
         tk.Checkbutton(frameD, text='Launch Browser',variable=launchbrowser, onvalue=1, offvalue=0).grid(row=2,column=1)
         frameD.grid(row=5,column=0,pady=4)
@@ -1526,11 +1468,8 @@ def show_old_gui():
         #load all the vars
         args.threads = int(threads_var.get())
         args.gpulayers = int(gpu_layers_var.get())
-
-        args.stream = (stream.get()==1)
         args.smartcontext = (smartcontext.get()==1)
         args.launch = (launchbrowser.get()==1)
-        args.unbantokens = (unbantokens.get()==1)
         args.highpriority = (highpriority.get()==1)
         args.nommap = (disablemmap.get()==1)
         selrunchoice = runchoice.get()
@@ -1899,11 +1838,6 @@ def main(launch_args,start_server=True):
         else:
             args.lora[1] = os.path.abspath(args.lora[1])

-    if args.psutil_set_threads:
-        import psutil
-        args.threads = psutil.cpu_count(logical=False)
-        print("Overriding thread count, using " + str(args.threads) + " threads instead.")
-
     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads

@@ -1955,17 +1889,6 @@ def main(launch_args,start_server=True):
             timer_thread = threading.Timer(1, onready_subprocess) #1 second delay
             timer_thread.start()

-    # show deprecation warnings
-    if args.unbantokens:
-        print("WARNING: --unbantokens is DEPRECATED and will be removed soon! EOS unbans should now be set via the generate API.")
-    if args.usemirostat:
-        print("WARNING: --usemirostat is DEPRECATED and will be removed soon! Mirostat values should now be set via the generate API.")
-    if args.stream:
-        print("WARNING: --stream is DEPRECATED and will be removed soon! This was a Kobold Lite only parameter, which is now a proper setting toggle inside Lite.")
-    if args.psutil_set_threads:
-        print("WARNING: --psutil_set_threads is DEPRECATED and will be removed soon! This parameter was generally unhelpful and unnecessary, as the defaults were usually sufficient")
-
-
     if start_server:
         print(f"Please connect to custom endpoint at {epurl}")
         asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite))
@@ -2012,13 +1935,7 @@ if __name__ == '__main__':
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
     parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
-    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them. Polled-streaming is disabled while multiple requests are in queue.", action='store_true')
+    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
     parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')

-    #deprecated
-    parser.add_argument("--psutil_set_threads", help="--psutil_set_threads is DEPRECATED and will be removed soon! This parameter was generally unhelpful and unnecessary, as the defaults were usually sufficient.", action='store_true')
-    parser.add_argument("--stream", help="--stream is DEPRECATED and will be removed soon! This was a Kobold Lite only parameter, which is now a proper setting toggle inside Lite.", action='store_true')
-    parser.add_argument("--unbantokens", help="--unbantokens is DEPRECATED and will be removed soon! EOS unbans should now be set via the generate API", action='store_true')
-    parser.add_argument("--usemirostat", help="--usemirostat is DEPRECATED and will be removed soon! Mirostat values should now be set via the generate API",metavar=('[type]', '[tau]', '[eta]'), type=float, nargs=3)
-
     main(parser.parse_args(),start_server=True)
\ No newline at end of file