diff --git a/README.md b/README.md
index e7d2c632c..a5a85a69a 100644
--- a/README.md
+++ b/README.md
@@ -16,4 +16,4 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 ## Usage
 - Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
-- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
+- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite (for example, https://lite.koboldai.net/?local=1&port=5001).
diff --git a/llama_for_kobold.py b/llama_for_kobold.py
index dc4b5564a..584c34654 100644
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -5,6 +5,7 @@
 
 import ctypes
 import os
+#from pathlib import Path
 
 class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
@@ -33,7 +34,7 @@ handle = ctypes.CDLL(dir_path + "/llamacpp.dll")
 
 handle.load_model.argtypes = [load_model_inputs]
 handle.load_model.restype = ctypes.c_bool
-handle.generate.argtypes = [generation_inputs]
+handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
 handle.generate.restype = generation_outputs
 
 def load_model(model_filename,batch_size=8,max_context_length=512,threads=4,n_parts_overwrite=-1):
@@ -71,8 +72,8 @@ def generate(prompt,max_length=20,temperature=0.8,top_k=100,top_p=0.85,rep_pen=1
 import json, http.server, threading, socket, sys, time
 
 # global vars
-global modelname
-modelname = ""
+global friendlymodelname
+friendlymodelname = ""
 maxctx = 1024
 maxlen = 256
 modelbusy = False
@@ -95,8 +96,8 @@ class ServerRequestHandler(http.server.BaseHTTPRequestHandler):
         if self.path.endswith('/api/v1/model/') or self.path.endswith('/api/latest/model/'):
             self.send_response(200)
             self.end_headers()
-            global modelname
-            self.wfile.write(json.dumps({"result": modelname }).encode())
+            global friendlymodelname
+            self.wfile.write(json.dumps({"result": friendlymodelname }).encode())
             return
 
         if self.path.endswith('/api/v1/config/max_length/') or self.path.endswith('/api/latest/config/max_length/'):
@@ -122,7 +123,7 @@ class ServerRequestHandler(http.server.BaseHTTPRequestHandler):
     def do_POST(self):
         content_length = int(self.headers['Content-Length'])
         body = self.rfile.read(content_length)
-        if self.path.endswith('/api/v1/generate/') or self.path.endswith('/api/latest/generate/'):
+        if self.path.endswith('/api/v1/generate/') or self.path.endswith('/api/latest/generate/') or self.path.endswith('/api/v1/generate') or self.path.endswith('/api/latest/generate'):
             global modelbusy
             global last_context
             if modelbusy:
@@ -257,6 +258,9 @@ if __name__ == '__main__':
     loadok = load_model(modelname,24,maxctx,4,mdl_nparts)
     print("Load Model OK: " + str(loadok))
 
+    #friendlymodelname = Path(modelname).stem ### this wont work on local kobold api, so we must hardcode a known HF model name
+    friendlymodelname = "concedo/llamacpp"
+
     if loadok:
         print("Starting Kobold HTTP Server on port " + str(port))
         print("Please connect to custom endpoint at http://localhost:"+str(port))
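
For context, here is a minimal client sketch (not part of the patch) that exercises the two endpoints this diff touches, assuming the server was started as `llama_for_kobold.py ggml_quant_model.bin 5001` to match the README example. The GET /api/v1/model/ response shape ({"result": friendlymodelname}) comes directly from the handler above; the POST body field names simply mirror generate()'s parameters in the script and are an assumption here, since the JSON parsing inside do_POST is outside these hunks.

import json
import urllib.request

BASE = "http://localhost:5001"  # use whatever port you passed on the command line

# GET /api/v1/model/ returns the hardcoded friendly name introduced by this patch.
with urllib.request.urlopen(BASE + "/api/v1/model/") as resp:
    print(json.loads(resp.read().decode("utf-8")))  # expected: {"result": "concedo/llamacpp"}

# POST /api/v1/generate is now accepted with or without the trailing slash.
# The field names below mirror generate()'s parameters; the exact JSON schema
# accepted by do_POST is not shown in this diff, so treat them as assumptions.
payload = json.dumps({
    "prompt": "Hello, my name is",
    "max_length": 80,
    "temperature": 0.8,
    "top_k": 100,
    "top_p": 0.85,
    "rep_pen": 1.1,
}).encode("utf-8")
req = urllib.request.Request(
    BASE + "/api/v1/generate",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode("utf-8"))  # response schema is not shown in this diff, so print it raw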