diff --git a/README.md b/README.md
index e7d2c632c..a5a85a69a 100644
--- a/README.md
+++ b/README.md
@@ -16,4 +16,4 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 ## Usage
 - Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
-- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
+- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite (for example, https://lite.koboldai.net/?local=1&port=5001).
diff --git a/llama_for_kobold.py b/llama_for_kobold.py
index dc4b5564a..584c34654 100644
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -5,6 +5,7 @@
 
 import ctypes
 import os
+#from pathlib import Path
 
 class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
@@ -33,7 +34,7 @@ handle = ctypes.CDLL(dir_path + "/llamacpp.dll")
 
 handle.load_model.argtypes = [load_model_inputs]
 handle.load_model.restype = ctypes.c_bool
-handle.generate.argtypes = [generation_inputs]
+handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
 handle.generate.restype = generation_outputs
 
 def load_model(model_filename,batch_size=8,max_context_length=512,threads=4,n_parts_overwrite=-1):
@@ -71,8 +72,8 @@ def generate(prompt,max_length=20,temperature=0.8,top_k=100,top_p=0.85,rep_pen=1
 import json, http.server, threading, socket, sys, time
 
 # global vars
-global modelname
-modelname = ""
+global friendlymodelname
+friendlymodelname = ""
 maxctx = 1024
 maxlen = 256
 modelbusy = False
@@ -95,8 +96,8 @@ class ServerRequestHandler(http.server.BaseHTTPRequestHandler):
         if self.path.endswith('/api/v1/model/') or self.path.endswith('/api/latest/model/'):
             self.send_response(200)
             self.end_headers()
-            global modelname
-            self.wfile.write(json.dumps({"result": modelname }).encode())
+            global friendlymodelname
+            self.wfile.write(json.dumps({"result": friendlymodelname }).encode())
             return
 
         if self.path.endswith('/api/v1/config/max_length/') or self.path.endswith('/api/latest/config/max_length/'):
@@ -122,7 +123,7 @@ class ServerRequestHandler(http.server.BaseHTTPRequestHandler):
     def do_POST(self):
         content_length = int(self.headers['Content-Length'])
         body = self.rfile.read(content_length)
-        if self.path.endswith('/api/v1/generate/') or self.path.endswith('/api/latest/generate/'):
+        if self.path.endswith('/api/v1/generate/') or self.path.endswith('/api/latest/generate/') or self.path.endswith('/api/v1/generate') or self.path.endswith('/api/latest/generate'):
             global modelbusy
             global last_context
             if modelbusy:
@@ -257,6 +258,9 @@ if __name__ == '__main__':
     loadok = load_model(modelname,24,maxctx,4,mdl_nparts)
     print("Load Model OK: " + str(loadok))
 
+    #friendlymodelname = Path(modelname).stem ### this wont work on local kobold api, so we must hardcode a known HF model name
+    friendlymodelname = "concedo/llamacpp"
+
     if loadok:
         print("Starting Kobold HTTP Server on port " + str(port))
         print("Please connect to custom endpoint at http://localhost:"+str(port))
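
For context, here is a minimal client sketch (not part of the patch) that exercises the two endpoints this diff touches, assuming the server was started as `llama_for_kobold.py ggml_quant_model.bin 5001` to match the README example. The GET /api/v1/model/ response shape ({"result": friendlymodelname}) comes directly from the handler above; the POST body field names simply mirror generate()'s parameters in the script and are an assumption here, since the JSON parsing inside do_POST is outside these hunks.

import json
import urllib.request

BASE = "http://localhost:5001"  # use whatever port you passed on the command line

# GET /api/v1/model/ returns the hardcoded friendly name introduced by this patch.
with urllib.request.urlopen(BASE + "/api/v1/model/") as resp:
    print(json.loads(resp.read().decode("utf-8")))  # expected: {"result": "concedo/llamacpp"}

# POST /api/v1/generate is now accepted with or without the trailing slash.
# The field names below mirror generate()'s parameters; the exact JSON schema
# accepted by do_POST is not shown in this diff, so treat them as assumptions.
payload = json.dumps({
    "prompt": "Hello, my name is",
    "max_length": 80,
    "temperature": 0.8,
    "top_k": 100,
    "top_p": 0.85,
    "rep_pen": 1.1,
}).encode("utf-8")
req = urllib.request.Request(
    BASE + "/api/v1/generate",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode("utf-8"))  # response schema is not shown in this diff, so print it raw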