fixed 70B detection again, attempted fix for horde issues, fixed Lite unicode issue, fixed CMake for CUDA
commit 793cfd136c
parent 3554080502
4 changed files with 9 additions and 3 deletions
CMakeLists.txt
@@ -106,7 +106,12 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            message("CUDA version: ${CUDA_VERSION_MAJOR}")
+            if(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
+                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            else()
+                set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            endif()
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
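The new else() branch picks the architecture list based on the detected toolkit version: CUDA 12 dropped Kepler support, so compute 3.7 is only requested on older toolkits. A minimal Python sketch of that selection logic, where pick_cuda_architectures and its parameters are illustrative names, not part of the commit:

```python
def pick_cuda_architectures(cuda_version_major: int, want_f16: bool) -> str:
    """Sketch of the CMake arch-selection logic above."""
    if want_f16:
        # f16 CUDA intrinsics need compute 6.0+
        return "60;61;70"
    if cuda_version_major >= 12:
        # CUDA 12 no longer accepts compute 3.7 (Kepler, e.g. K80)
        return "52;61;70"
    return "37;52;61;70"

assert pick_cuda_architectures(12, True) == "60;61;70"
assert pick_cuda_architectures(12, False) == "52;61;70"
assert pick_cuda_architectures(11, False) == "37;52;61;70"
```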
File diff suppressed because one or more lines are too long
koboldcpp.py
@@ -1689,6 +1689,7 @@ def main(args):
 
     if args.hordeconfig and len(args.hordeconfig)>4:
         horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
+        horde_thread.setDaemon(True)
         horde_thread.start()
 
     print(f"Please connect to custom endpoint at {epurl}")
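Marking the horde worker thread as a daemon means a stuck worker no longer blocks process exit, which is the likely intent behind "attempted fix for horde issues". Note that Thread.setDaemon() is the legacy spelling (deprecated since Python 3.10); a self-contained sketch of the equivalent modern form, with a stand-in worker loop since run_horde_worker is not reproduced here:

```python
import threading
import time

def run_horde_worker(args, apikey, workername):
    # Stand-in for the real worker: loops forever polling the horde.
    while True:
        time.sleep(1)

# daemon=True at construction replaces the deprecated setDaemon(True);
# the interpreter may now exit even though this loop never returns.
horde_thread = threading.Thread(
    target=run_horde_worker,
    args=(None, "apikey", "workername"),
    daemon=True,
)
horde_thread.start()
```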
llama.cpp
@@ -1081,7 +1081,7 @@ static void llama_model_load_internal(
         // LLaMAv2
         // TODO: temporary until GGUF
         //patch for llama2 gqa
-        if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
+        if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
             fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
             n_gqa = 8;
         }
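The widened heuristic now treats any model detected as MODEL_65B whose n_mult is 4096 or larger as a LLaMA-2 70B and forces grouped-query attention to 8, while carving out n_mult == 5504, presumably a non-70B checkpoint that would otherwise match. A Python sketch of the check, with detect_n_gqa and the default of 1 (plain multi-head attention) being illustrative assumptions rather than code from the commit:

```python
def detect_n_gqa(model_type: str, n_mult: int) -> int:
    """Sketch of the widened 70B heuristic: a '65B'-shaped model with
    n_mult >= 4096 is assumed to be a LLaMA-2 70B (GQA = 8), except
    the explicitly excluded n_mult == 5504."""
    if model_type == "MODEL_65B" and n_mult >= 4096 and n_mult != 5504:
        return 8  # 70B groups 8 query heads per KV head
    return 1      # assumed default: one query head per KV head (MHA)

assert detect_n_gqa("MODEL_65B", 4096) == 8  # the old check only caught this value
assert detect_n_gqa("MODEL_65B", 7168) == 8  # now also detected as 70B
assert detect_n_gqa("MODEL_65B", 5504) == 1  # explicitly excluded
```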