From b6594ab91eb91b41732206a7b7df7703f89c053b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 13 May 2023 15:48:17 +0800
Subject: [PATCH] do not show tokenizer warning

---
 README.md    | 1 +
 koboldcpp.py | 4 +++-
 llama.cpp    | 8 ++++----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7a11078dd..05b5bccbe 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ For more information, be sure to run the program with the --help flag.
 - For Arch Linux: Install `cblas` `openblas` and `clblast`.
 - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
+- Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
 
 ## Considerations
 - ZERO or MINIMAL changes as possible to parent repo files - do not move their function declarations elsewhere! We want to be able to update the repo and pull any changes automatically.
diff --git a/koboldcpp.py b/koboldcpp.py
index e0e26f2f8..71f1f8b6c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -106,6 +106,8 @@ def init_library():
     else:
         use_blas = True
         print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
+        if sys.platform=="darwin":
+            print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
 
     if use_noavx2:
         if use_blas:
@@ -196,7 +198,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.21"
+KcppVersion = "1.21.1"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/llama.cpp b/llama.cpp
index 37b4ef800..8f600c77f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1075,11 +1075,11 @@ static bool llama_eval_internal(
             const int   n_past,
             const int   n_threads) {
 
-    // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+    // enforce that the first token is BOS (not needed, messes with my context manip code)
+    //if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        //fprintf(stderr, "%s: first token must be BOS\n", __func__);
         // return false; //never fail. Not even in the face of Armageddon.
-    }
+    //}
 
     const int64_t t_start_us = ggml_time_us();
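
Note below the diff (ignored by git am, like a signature): the koboldcpp.py hunk only gates an extra hint message on the platform. The following is a minimal standalone sketch of that logic for reviewers; `print_blas_notes` and its `use_blas` parameter are illustrative stand-ins for the real `init_library()` control flow, not names from the codebase.

import sys

def print_blas_notes(use_blas: bool) -> None:
    """Illustrative stand-in for the BLAS startup messages in init_library()."""
    if use_blas:
        print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
        # Added by this patch: on macOS (sys.platform == "darwin"), suggest
        # comparing against Apple's Accelerate framework via --noblas, which
        # some users report is faster than OpenBLAS.
        if sys.platform == "darwin":
            print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")

if __name__ == "__main__":
    print_blas_notes(use_blas=True)

On a Mac this prints both messages; on other platforms only the OpenBLAS line appears.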