fixed 70B detection again, attempted fix for horde issues, fixed Lite unicode issue, fixed CMake for CUDA
commit 793cfd136c
parent 3554080502
4 changed files with 9 additions and 3 deletions
CMakeLists.txt
@@ -106,7 +106,12 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            message("CUDA version: ${CUDA_VERSION_MAJOR}")
+            if(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
+                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            else()
+                set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            endif()
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
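The new else() branch picks the architecture list based on the detected toolkit version: CUDA 12 dropped Kepler support, so compute 3.7 is only requested on older toolkits. A minimal Python sketch of that selection logic, where pick_cuda_architectures and its parameters are illustrative names, not part of the commit:

```python
def pick_cuda_architectures(cuda_version_major: int, want_f16: bool) -> str:
    """Sketch of the CMake arch-selection logic above."""
    if want_f16:
        # f16 CUDA intrinsics need compute 6.0+
        return "60;61;70"
    if cuda_version_major >= 12:
        # CUDA 12 no longer accepts compute 3.7 (Kepler, e.g. K80)
        return "52;61;70"
    return "37;52;61;70"

assert pick_cuda_architectures(12, True) == "60;61;70"
assert pick_cuda_architectures(12, False) == "52;61;70"
assert pick_cuda_architectures(11, False) == "37;52;61;70"
```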
File diff suppressed because one or more lines are too long
koboldcpp.py
@@ -1689,6 +1689,7 @@ def main(args):
 
     if args.hordeconfig and len(args.hordeconfig)>4:
         horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
+        horde_thread.setDaemon(True)
         horde_thread.start()
 
     print(f"Please connect to custom endpoint at {epurl}")
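Marking the horde worker thread as a daemon means a stuck worker no longer blocks process exit, which is the likely intent behind "attempted fix for horde issues". Note that Thread.setDaemon() is the legacy spelling (deprecated since Python 3.10); a self-contained sketch of the equivalent modern form, with a stand-in worker loop since run_horde_worker is not reproduced here:

```python
import threading
import time

def run_horde_worker(args, apikey, workername):
    # Stand-in for the real worker: loops forever polling the horde.
    while True:
        time.sleep(1)

# daemon=True at construction replaces the deprecated setDaemon(True);
# the interpreter may now exit even though this loop never returns.
horde_thread = threading.Thread(
    target=run_horde_worker,
    args=(None, "apikey", "workername"),
    daemon=True,
)
horde_thread.start()
```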
llama.cpp
@@ -1081,7 +1081,7 @@ static void llama_model_load_internal(
         // LLaMAv2
         // TODO: temporary until GGUF
         //patch for llama2 gqa
-        if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
+        if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
             fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
             n_gqa = 8;
         }
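The widened heuristic now treats any model detected as MODEL_65B whose n_mult is 4096 or larger as a LLaMA-2 70B and forces grouped-query attention to 8, while carving out n_mult == 5504, presumably a non-70B checkpoint that would otherwise match. A Python sketch of the check, with detect_n_gqa and the default of 1 (plain multi-head attention) being illustrative assumptions rather than code from the commit:

```python
def detect_n_gqa(model_type: str, n_mult: int) -> int:
    """Sketch of the widened 70B heuristic: a '65B'-shaped model with
    n_mult >= 4096 is assumed to be a LLaMA-2 70B (GQA = 8), except
    the explicitly excluded n_mult == 5504."""
    if model_type == "MODEL_65B" and n_mult >= 4096 and n_mult != 5504:
        return 8  # 70B groups 8 query heads per KV head
    return 1      # assumed default: one query head per KV head (MHA)

assert detect_n_gqa("MODEL_65B", 4096) == 8  # the old check only caught this value
assert detect_n_gqa("MODEL_65B", 7168) == 8  # now also detected as 70B
assert detect_n_gqa("MODEL_65B", 5504) == 1  # explicitly excluded
```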