diff --git a/CMakeLists.txt b/CMakeLists.txt
index c133be87b..3d21f1b56 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,12 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            message("CUDA version: ${CUDA_VERSION_MAJOR}")
+            if(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
+                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            else()
+                set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            endif()
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
diff --git a/klite.embd b/klite.embd
index 2ce9c166b..153b4af7d 100644
--- a/klite.embd
+++ b/klite.embd
@@ -25,7 +25,7 @@ Kobold Lite is under the AGPL v3.0 License for the purposes of koboldcpp. Please
-
diff --git a/koboldcpp.py b/koboldcpp.py
index ed50f3708..34a619cd7 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1689,6 +1689,7 @@ def main(args):
 
     if args.hordeconfig and len(args.hordeconfig)>4:
         horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
+        horde_thread.setDaemon(True)
         horde_thread.start()
 
     print(f"Please connect to custom endpoint at {epurl}")
diff --git a/llama.cpp b/llama.cpp
index e35af038f..61c1f7998 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1081,7 +1081,7 @@ static void llama_model_load_internal(
         // LLaMAv2
         // TODO: temporary until GGUF
         //patch for llama2 gqa
-        if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
+        if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
             fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
             n_gqa = 8;
         }