From b6594ab91eb91b41732206a7b7df7703f89c053b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 13 May 2023 15:48:17 +0800
Subject: [PATCH] do not show tokenizer warning

---
 README.md    | 1 +
 koboldcpp.py | 4 +++-
 llama.cpp    | 8 ++++----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7a11078dd..05b5bccbe 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ For more information, be sure to run the program with the --help flag.
 - For Arch Linux: Install `cblas` `openblas` and `clblast`.
 - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
+- Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
 
 ## Considerations
 - ZERO or MINIMAL changes as possible to parent repo files - do not move their function declarations elsewhere! We want to be able to update the repo and pull any changes automatically.
diff --git a/koboldcpp.py b/koboldcpp.py
index e0e26f2f8..71f1f8b6c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -106,6 +106,8 @@ def init_library():
     else:
         use_blas = True
         print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
+        if sys.platform=="darwin":
+            print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
 
     if use_noavx2:
         if use_blas:
@@ -196,7 +198,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.21"
+KcppVersion = "1.21.1"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/llama.cpp b/llama.cpp
index 37b4ef800..8f600c77f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1075,11 +1075,11 @@ static bool llama_eval_internal(
             const int   n_past,
             const int   n_threads) {
 
-    // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+    // enforce that the first token is BOS (not needed, messes with my context manip code)
+    //if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        //fprintf(stderr, "%s: first token must be BOS\n", __func__);
         // return false; //never fail. Not even in the face of Armageddon.
-    }
+    //}
 
     const int64_t t_start_us = ggml_time_us();
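
Note below the diff (ignored by git am, like a signature): the koboldcpp.py hunk only gates an extra hint message on the platform. The following is a minimal standalone sketch of that logic for reviewers; `print_blas_notes` and its `use_blas` parameter are illustrative stand-ins for the real `init_library()` control flow, not names from the codebase.

import sys

def print_blas_notes(use_blas: bool) -> None:
    """Illustrative stand-in for the BLAS startup messages in init_library()."""
    if use_blas:
        print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
        # Added by this patch: on macOS (sys.platform == "darwin"), suggest
        # comparing against Apple's Accelerate framework via --noblas, which
        # some users report is faster than OpenBLAS.
        if sys.platform == "darwin":
            print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")

if __name__ == "__main__":
    print_blas_notes(use_blas=True)

On a Mac this prints both messages; on other platforms only the OpenBLAS line appears.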