diff --git a/class.py b/class.py index 76ad123d7..bdf511c0a 100644 --- a/class.py +++ b/class.py @@ -9,7 +9,7 @@ import torch import requests import numpy as np from typing import List, Optional, Union -import os +import os, time from . import koboldcpp import utils @@ -20,11 +20,9 @@ from modeling.inference_model import ( InferenceModel, ) -model_backend_name = "koboldcpp" #specific instead of ggml +model_backend_name = "KoboldCPP" #specific instead of ggml model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) -kcpp_backend_loaded = False - class KoboldCppException(Exception): """To be used for errors on cpp side of KoboldCpp.""" @@ -35,6 +33,7 @@ class KcppArgsObject: class model_backend(InferenceModel): def __init__(self) -> None: super().__init__() + self.kcpp_backend_loaded = False def is_valid(self, model_name, model_path, menu_path): @@ -257,26 +256,31 @@ class model_backend(InferenceModel): def unload(self): print("Attemping to unload library") - koboldcpp.unload_libs() - global kcpp_backend_loaded - kcpp_backend_loaded = False - pass + self.process.terminate() + def _load(self, save_model: bool, initial_load: bool) -> None: - global kcpp_backend_loaded self.tokenizer = self._get_tokenizer("gpt2") - if not kcpp_backend_loaded: - kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename, - port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads, - highpriority=False, contextsize=self.kcpp_ctxsize, blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], - smartcontext=self.kcpp_smartcontext, bantokens=None, forceversion=0, nommap=self.kcpp_nommap, - usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas, - useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, - onready='', multiuser=False, foreground=False) + kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename, + port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads, + psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize, + blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext, + unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap, + usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas, + useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, + onready='', multiuser=False, foreground=False, preloadstory=None, noshift=False, remotetunnel=False) + - koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server - kcpp_backend_loaded = True - pass + #koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server + (self.output_queue, self.input_queue, self.process) = koboldcpp.start_in_seperate_process(kcppargs) + while True: + data = self.output_queue.get() + if data['command'] == 'load status': + utils.koboldai_vars.total_layers = data['data']['total'] + utils.koboldai_vars.loaded_layers = data['data']['loaded'] + elif data['command'] == 'complete': + break + time.sleep(0.02) def _save_settings(self): pass @@ -297,16 +301,31 @@ class model_backend(InferenceModel): # Store context in memory to use it for comparison with generated content utils.koboldai_vars.lastctx = decoded_prompt - genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length, - gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p, - gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range, - sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids) + self.input_queue.put({'command': 'generate', 'data': [(decoded_prompt,max_new,utils.koboldai_vars.max_length, + gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p, + gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range), + {"sampler_order": gen_settings.sampler_order, "use_default_badwordsids": utils.koboldai_vars.use_default_badwordsids} + ]}) + + #genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length, + #gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p, + #gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range, + #sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids) + + genresult = [] + while True: + data = self.output_queue.get() + print(data) + if data['command'] == 'generated text': + genresult.append(data['data']) + if self.output_queue.empty(): + break + time.sleep(0.02) - outputs = [genresult] return GenerationResult( model=self, out_batches=np.array( - [self.tokenizer.encode(x) for x in outputs] + [self.tokenizer.encode(x) for x in genresult] ), prompt=prompt_tokens, is_whole_generation=True, diff --git a/koboldcpp.py b/koboldcpp.py index 08d43f6a7..fd3914d13 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -13,6 +13,7 @@ import os import argparse import json, sys, http.server, time, asyncio, socket, threading from concurrent.futures import ThreadPoolExecutor +import multiprocessing sampler_order_max = 7 stop_token_max = 16 @@ -2330,6 +2331,25 @@ def main(launch_args,start_server=True): else: print(f"Server was not started, main function complete. Idling.") +def run_in_queue(launch_args, input_queue, output_queue): + main(launch_args, start_server=False) + output_queue.put({'command': 'complete'}) + while True: + if not input_queue.empty(): + while not input_queue.empty(): + data = input_queue.get() + if data['command'] == 'generate': + (args, kwargs) = data['data'] + output_queue.put({'command': 'generated text', 'data': generate(*args, **kwargs)}) + time.sleep(0.2) + +def start_in_seperate_process(launch_args): + input_queue = multiprocessing.Queue() + output_queue = multiprocessing.Queue() + p = multiprocessing.Process(target=run_in_queue, args=(launch_args, input_queue, output_queue)) + p.start() + return (output_queue, input_queue, p) + if __name__ == '__main__': print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually # print("Python version: " + sys.version)